From 69f3721a36d20e83f9282cc7ff8f9d8154a3a59c Mon Sep 17 00:00:00 2001 From: chezhang <1376507468@qq.com> Date: Thu, 4 Sep 2025 14:55:53 +0800 Subject: [PATCH 01/95] [fix] fix fail test when backend is mack --- .../batch_norm_kernel_register.cc | 10 +- .../conv_transpose_grad_kernel_register.cu | 40 - .../conv_transpose_grad_kernel_register.cu | 1114 +++++++++++++++++ .../impl/spectral_norm_grad_kernel_impl.h | 130 -- .../kernels/impl/spectral_norm_kernel_impl.h | 182 --- backends/metax_gpu/kernels/metax_context.cc | 1 + backends/metax_gpu/kernels/metax_context.h | 1 + .../instance_norm_grad_kerne_registerl.cu | 650 ++++++++++ .../instance_norm_kernel_register.cu | 253 ++++ .../spectral_norm_grad_kernel_register.cu | 22 + .../spectral_norm_kernel_register.cu | 22 + backends/metax_gpu/patch/paddle.patch | 462 +++++++ 12 files changed, 2534 insertions(+), 353 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu delete mode 100644 backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h delete mode 100644 backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h create mode 100644 backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc b/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc index b12f208bec0..ac3d8b95062 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc +++ b/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cc @@ -20,4 +20,12 @@ PD_CUSTOM_KERNEL_REGISTER(batch_norm_infer, ALL_LAYOUT, phi::BatchNormInferKernel, float, - phi::dtype::float16) {} + double, + phi::dtype::bfloat16, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu deleted file mode 100644 index dacced51df4..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_grad_kernel_register.cu +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gpu/conv_transpose_grad_kernel.cu" // NOLINT -PD_CUSTOM_KERNEL_REGISTER(conv2d_transpose_grad, - metax_gpu, - ALL_LAYOUT, - phi::Conv2dTransposeGradKernel, - float, - double) {} -PD_CUSTOM_KERNEL_REGISTER(conv2d_transpose_double_grad, - metax_gpu, - ALL_LAYOUT, - phi::Conv2dTransposeDoubleGradKernel, - float, - double) {} -PD_CUSTOM_KERNEL_REGISTER(conv3d_transpose_grad, - metax_gpu, - ALL_LAYOUT, - phi::Conv3dTransposeGradKernel, - float, - double) {} -PD_CUSTOM_KERNEL_REGISTER(depthwise_conv2d_transpose_grad, - metax_gpu, - ALL_LAYOUT, - phi::DepthwiseConv2dTransposeGradKernel, - float, - double) {} diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu new file mode 100644 index 00000000000..0067818d165 --- /dev/null +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu @@ -0,0 +1,1114 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "kernels/gpudnn/conv_cudnn_v7.h" +#include "kernels/metax_context.h" +#include "paddle/common/ddim.h" +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h" +#include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/conv_transpose_grad_kernel.h" +#include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/padding.h" +#include "paddle/phi/kernels/funcs/slice.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace phi { + +using GPUDNNDataLayout = phi::backends::gpu::DataLayout; + +template +void ConvTransposeGradRawGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + // 0-size + if (x.numel() == 0) { + if (dx) dev_ctx.template Alloc(dx); + if (dfilter) { + phi::Full(dev_ctx, + phi::IntArray(common::vectorize(dfilter->dims())), + 0, + dfilter); + } + return; + } + if (filter.numel() == 0) { + if (dfilter) dev_ctx.template Alloc(dfilter); + if (dx) { + phi::Full( + dev_ctx, phi::IntArray(common::vectorize(dx->dims())), 0, dx); + } + return; + } + + const T* filter_data = filter.data(); + std::vector paddings_ = paddings; + std::vector dilations_ = + dilations; // cudnn v5 does not support dilations + const GPUDNNDataLayout data_layout = + (data_format != "NHWC" 
? GPUDNNDataLayout::kNCHW + : GPUDNNDataLayout::kNHWC); + + // if channel_last, transpose to channel_first + DenseTensor x_transpose; + DenseTensor dout_transpose; + std::vector x_vec = common::vectorize(x.dims()); + std::vector out_vec = common::vectorize(dout.dims()); + if (data_layout == GPUDNNDataLayout::kNHWC) { + if (strides.size() == 2U) { + std::vector axis = {0, 3, 1, 2}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(dev_ctx, x, axis); + dout_transpose = Transpose(dev_ctx, dout, axis); + } else if (strides.size() == 3U) { + std::vector axis = {0, 4, 1, 2, 3}; + for (size_t i = 0; i < axis.size(); ++i) { + x_vec[i] = x.dims()[axis[i]]; + out_vec[i] = dout.dims()[axis[i]]; + } + x_transpose = Transpose(dev_ctx, x, axis); + dout_transpose = Transpose(dev_ctx, dout, axis); + } + } else { + x_transpose = x; + dout_transpose = dout; + } + + // update padding and dilation + auto x_dims = x_transpose.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims; + x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + + std::vector x_pad(x_dims.size() * 2, 0); + DenseTensor transformed_dout; + std::vector padding_common(data_dim, 0); + if (!is_sys_pad) { + std::vector padding_diff(data_dim); + std::vector new_dout_shape_vec(data_dim + 2); + new_dout_shape_vec[0] = dout_transpose.dims()[0]; + new_dout_shape_vec[1] = dout_transpose.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_dout_shape_vec[i + 2] = + dout_transpose.dims()[i + 2] + padding_diff[i]; + x_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + x_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + + transformed_dout.Resize(common::make_ddim(new_dout_shape_vec)); + dev_ctx.template Alloc(&transformed_dout); + + const int rank = x_transpose.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction( + dev_ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + case 5: { + funcs::PadFunction( + dev_ctx, x_pad, dout_transpose, pad_value, &transformed_dout); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "Op(ConvTranspose) only supports 4-D or 5-D x DenseTensor.")); + } + } else { + transformed_dout = dout_transpose; + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + const T* x_data = x_transpose.data(); + const T* dout_data = transformed_dout.data(); + out_vec = common::vectorize(transformed_dout.dims()); + + // ------------------- cudnn descriptors --------------------- +#ifndef PADDLE_WITH_HIP + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(transformed_dout); + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(filter); + CUDNN_ENFORCE_TENSOR_SIZE_SUPPORTED(x_transpose); +#endif + + GPUDNNDataLayout layout; + + if (strides.size() == 2U) { + layout = GPUDNNDataLayout::kNCHW; + } else { + layout = 
GPUDNNDataLayout::kNCDHW; + } + + int iwo_groups = groups; + int c_groups = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + auto dtype = phi::backends::gpu::CudnnDataType::type; + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + ConvArgs args1{handle, + &transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + ConvArgs args2{handle, + &transformed_dout, + &filter, + &x_transpose, + strides, + padding_common, + dilations_, + dtype, + groups, + layout}; + +#ifdef PADDLE_WITH_HIP + SearchResult fwd_result; + SearchResult filter_result; +#else + SearchResult fwd_result; + SearchResult filter_result; +#endif + + auto layout_tensor = phi::backends::gpu::GetCudnnTensorFormat(layout); + size_t workspace_size = 0; + bool deterministic = FLAGS_cudnn_deterministic; + T* dx_data = nullptr; + T* dfilter_data = nullptr; + + if (dx) { + dx_data = dev_ctx.template Alloc(dx); + + args1.idesc.set(transformed_dout, iwo_groups); + args1.wdesc.set(filter, layout_tensor, iwo_groups); + args1.odesc.set(x_transpose, iwo_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1)); + fwd_result.algo = + search1::Find(args1, false, deterministic, workspace_size, dev_ctx); +#else + using search1 = SearchAlgorithm; + fwd_result = search1::Find(dev_ctx, args1, false, deterministic, false); + workspace_size = std::max( + workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); +#endif + } + + if (dfilter) { + dfilter_data = dev_ctx.template Alloc(dfilter); + + args2.idesc.set(transformed_dout, iwo_groups); + args2.wdesc.set(*dfilter, layout_tensor, iwo_groups); + args2.odesc.set(x_transpose, iwo_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_result.algo = + search2::Find(args2, false, deterministic, workspace_size, dev_ctx); +#else + using search2 = SearchAlgorithm; + filter_result = + search2::Find(dev_ctx, args2, false, deterministic, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); +#endif + } + + // ------------------- cudnn conv backward data --------------------- + // FIxME(typhoonzero): template type T may not be the same as cudnn call. + int x_offset = x.numel() / x.dims()[0] / groups; + int dout_offset = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int filter_offset = filter.numel() / groups; + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + // auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = GetDnnWorkspace( + const_cast(&(dev_ctx.GetAllocator())), dev_ctx.stream()); + if (dx) { +#ifdef PADDLE_WITH_HIP + // Because beta is zero, it is unnecessary to reset dx. 
+ for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionForward(handle, + &alpha, + args1.idesc.desc(), + dout_data + dout_offset * g, + args1.wdesc.desc(), + filter_data + filter_offset * g, + args1.cdesc.desc(), + fwd_result.algo, + &beta, + args1.odesc.desc(), + dx_data + x_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args1, + fwd_result, + dout_data, + filter_data, + dx_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + + if (data_layout == GPUDNNDataLayout::kNHWC) { + DenseTensor dx_transpose; + DenseTensor dx_nchw; + dx_nchw.ShareDataWith(*dx); + dx_nchw.Resize(common::make_ddim(x_vec)); + if (strides.size() == 2U) { + std::vector axis = {0, 2, 3, 1}; + dx_transpose = Transpose(dev_ctx, dx_nchw, axis); + *dx = dx_transpose; + } else if (strides.size() == 3U) { + std::vector axis = {0, 2, 3, 4, 1}; + dx_transpose = Transpose(dev_ctx, dx_nchw, axis); + *dx = dx_transpose; + } + } + } + + // ------------------- cudnn conv backward filter --------------------- + if (dfilter) { + // Because beta is zero, it is unnecessary to reset dfilter. + // Gradient with respect to the filter +#ifdef PADDLE_WITH_HIP + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args2.odesc.desc(), + x_data + x_offset * g, + args2.idesc.desc(), + dout_data + dout_offset * g, + args2.cdesc.desc(), + filter_result.algo, + &beta, + args2.wdesc.desc(), + dfilter_data + filter_offset * g, + cudnn_workspace, + workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args2, + filter_result, + x_data, + dout_data, + dfilter_data, + groups, + dout_offset, + filter_offset, + x_offset, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + } +} + +template +void Conv2dTransposeGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(dev_ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +/* + * Inputs: I, filter, dout, ddI, ddfilter + * Outputs: ddout, dfilter, dI + * ddo = conv_bp_data(filter, ddI) + conv_bp_data(ddfilter, I) + * dfilter = conv_bp_filter(dout, ddI) + * dI = conv(dout, ddfilter) + */ +template +void Conv2dTransposeDoubleGradGPUDNNKernel( + const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const DenseTensor& ddx, + const DenseTensor& ddfilter, + const std::vector& strides, + const std::vector& paddings, + const std::vector& output_padding, + const IntArray& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter, + DenseTensor* ddout) { + 
if (dx) { + dev_ctx.template Alloc(dx); + } + if (dfilter) { + dev_ctx.template Alloc(dfilter); + } + if (ddout) { + dev_ctx.template Alloc(ddout); + funcs::SetConstant set_zero; + set_zero(dev_ctx, ddout, static_cast(0)); + } + + const T* filter_ = filter.data(); + const T* dout_ = dout.data(); + const T* ddx_ = nullptr; + const T* ddfilter_ = nullptr; + T* dx_ = nullptr; + T* dfilter_ = nullptr; + T* ddout_ = nullptr; + T* transformed_dx_ = nullptr; + + std::vector paddings_ = paddings; + std::vector dilations_ = dilations; + + bool deterministic = FLAGS_cudnn_deterministic; + const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); + + // transform DenseTensors to channel first----------- + DenseTensor transformed_x_channel(x.type()); + DenseTensor transformed_dout_channel(dout.type()); + DenseTensor transformed_ddx_channel(x.type()); + + DenseTensor transformed_dx_channel(x.type()); + DenseTensor transformed_ddout_channel(dout.type()); + + if (channel_last) { + ResizeToChannelFirst(dev_ctx, &x, &transformed_x_channel); + TransToChannelFirst(dev_ctx, &x, &transformed_x_channel); + + ResizeToChannelFirst(dev_ctx, &dout, &transformed_dout_channel); + TransToChannelFirst(dev_ctx, &dout, &transformed_dout_channel); + + ResizeToChannelFirst(dev_ctx, &ddx, &transformed_ddx_channel); + TransToChannelFirst(dev_ctx, &ddx, &transformed_ddx_channel); + + if (dx) { + ResizeToChannelFirst(dev_ctx, dx, &transformed_dx_channel); + dev_ctx.template Alloc(&transformed_dx_channel); + } + if (ddout) { + ResizeToChannelFirst( + dev_ctx, ddout, &transformed_ddout_channel); + } + } else { + transformed_x_channel = x; + transformed_dout_channel = dout; + transformed_ddx_channel = ddx; + + if (dx) { + transformed_dx_channel = *dx; + } + } + std::vector out_vec = + common::vectorize(transformed_dout_channel.dims()); + + auto x_dims = transformed_x_channel.dims(); + auto filter_dims = filter.dims(); + DDim x_data_dims = slice_ddim(x_dims, 2, x_dims.size()); + DDim filter_data_dims = slice_ddim(filter_dims, 2, filter_dims.size()); + std::vector ksize = common::vectorize(filter_data_dims); + UpdatePaddingAndDilation( + &paddings_, &dilations_, padding_algorithm, x_data_dims, strides, ksize); + + int data_dim = strides.size(); // 2d or 3d + bool is_sys_pad = funcs::IsSymmetricPadding(paddings_, data_dim); + DenseTensor transformed_x(x.type()); + DenseTensor transformed_ddx(x.type()); + + DenseTensor transformed_dout(dout.type()); + + std::vector padding_common(data_dim, 0); + std::vector input_pad(x.dims().size() * 2, 0); + + if (!is_sys_pad) { + // get pad + std::vector padding_diff(data_dim); + std::vector new_input_shape_vec(data_dim + 2); + std::vector new_output_grad_shape_vec(data_dim + 2); + + new_input_shape_vec[0] = transformed_x_channel.dims()[0]; + new_input_shape_vec[1] = transformed_x_channel.dims()[1]; + + new_output_grad_shape_vec[0] = transformed_dout_channel.dims()[0]; + new_output_grad_shape_vec[1] = transformed_dout_channel.dims()[1]; + + for (size_t i = 0; i < data_dim; ++i) { + padding_diff[i] = std::abs(paddings_[2 * i] - paddings_[2 * i + 1]); + padding_common[i] = std::min(paddings_[2 * i], paddings_[2 * i + 1]); + new_input_shape_vec[i + 2] = + transformed_x_channel.dims()[i + 2] + padding_diff[i]; + + new_output_grad_shape_vec[i + 2] = + transformed_dout_channel.dims()[i + 2] + padding_diff[i]; + + input_pad[2 * i + 4] = paddings_[2 * i] - padding_common[i]; + input_pad[2 * i + 4 + 1] = paddings_[2 * i + 1] - padding_common[i]; + } + DDim 
new_input_shape(common::make_ddim(new_input_shape_vec)); + transformed_x.Resize(new_input_shape); + transformed_ddx.Resize(new_input_shape); + transformed_dout.Resize(common::make_ddim(new_output_grad_shape_vec)); + + dev_ctx.template Alloc(&transformed_x); + dev_ctx.template Alloc(&transformed_ddx); + dev_ctx.template Alloc(&transformed_dout); + + // pad for input + const int rank = x.dims().size(); + T pad_value(0.0); + switch (rank) { + case 4: { + funcs::PadFunction(dev_ctx, + input_pad, + transformed_x_channel, + pad_value, + &transformed_x); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_dout_channel, + pad_value, + &transformed_dout); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + case 5: { + funcs::PadFunction(dev_ctx, + input_pad, + transformed_x_channel, + pad_value, + &transformed_x); + funcs::PadFunction(dev_ctx, + input_pad, + transformed_ddx_channel, + pad_value, + &transformed_ddx); + } break; + default: + PADDLE_THROW(errors::InvalidArgument( + "ConvOp only support tensors with 4 or 5 dimensions.")); + } + } else { + transformed_x = transformed_x_channel; + transformed_dout = transformed_dout_channel; + transformed_ddx = transformed_ddx_channel; + + if (paddings_.size() == data_dim) { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[i]; + } + } else { + for (size_t i = 0; i < data_dim; ++i) { + padding_common[i] = paddings_[2 * i]; + } + } + } + + std::vector starts(data_dim, 0); + std::vector ends(data_dim, 0); + std::vector axes(data_dim, 0); + for (size_t i = 0; i < data_dim; ++i) { + starts[i] = input_pad[2 * i + 4] * (strides[i] + 1); + ends[i] = starts[i] + out_vec[i + 2]; + axes[i] = i + 2; + } + + std::vector transformed_out_vec = out_vec; + for (size_t i = 0; i < data_dim; ++i) { + transformed_out_vec[i + 2] = + out_vec[i + 2] + + (input_pad[2 * i + 4] + input_pad[2 * i + 5]) * strides[i] - + 2 * padding_common[i] + paddings_[2 * i] + paddings_[2 * i + 1]; + } + + if (!is_sys_pad) { + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); + dev_ctx.template Alloc(&transformed_ddout_channel); + } else { + dev_ctx.template Alloc(ddout); + transformed_ddout_channel = *ddout; + transformed_ddout_channel.Resize(common::make_ddim(transformed_out_vec)); + } + + const T* x_ = transformed_x.data(); + + int iwo_group = groups; + int c_group = 1; +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; + groups = 1; +#endif + auto dtype = phi::backends::gpu::CudnnDataType::type; + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + auto layout = + phi::backends::gpu::GetCudnnTensorFormat(GPUDNNDataLayout::kNCHW); + + ConvArgs args1{handle, + &transformed_ddout_channel, + &filter, + &transformed_ddx, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args2{handle, + &transformed_ddout_channel, + &ddfilter, + &transformed_x, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + + ConvArgs args3{handle, + &transformed_dout, + dfilter, + &transformed_ddx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; + ConvArgs args4{handle, + &transformed_dout, + &ddfilter, + &transformed_dx_channel, + strides, + padding_common, + dilations_, + dtype, + groups, + GPUDNNDataLayout::kNCHW}; +#ifdef PADDLE_WITH_HIP + SearchResult 
bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; +#else + SearchResult bwd_result1; + SearchResult bwd_result2; + SearchResult filter_result; + SearchResult fwd_result; +#endif + + // ddo = conv(ddI, filter) + conv(I, ddfilter) + size_t workspace_size = 0; + + T* transformed_ddout_channel_ = nullptr; + + if (ddout) { + ddout_ = ddout->data(); + transformed_ddout_channel_ = transformed_ddout_channel.data(); + + args1.idesc.set(transformed_ddout_channel, iwo_group); + args1.wdesc.set(filter, layout, iwo_group); + args1.odesc.set(transformed_ddx, iwo_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + bwd_result1.algo = + search1::Find(args1, false, deterministic, workspace_size, dev_ctx); +#else + using search1 = SearchAlgorithm; + bwd_result1 = search1::Find(dev_ctx, args1, false, deterministic, false); + workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); +#endif + + ddfilter_ = ddfilter.data(); + args2.handle = handle; + args2.idesc.set(transformed_ddout_channel, iwo_group); + args2.wdesc.set(ddfilter, layout, iwo_group); + args2.odesc.set(transformed_x, iwo_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search2 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); + bwd_result2.algo = + search2::Find(args2, false, deterministic, workspace_size, dev_ctx); +#else + using search2 = SearchAlgorithm; + bwd_result2 = search2::Find(dev_ctx, args2, false, deterministic, false); + workspace_size = std::max( + workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); +#endif + } + + if (dfilter) { + dfilter_ = dfilter->data(); + + args3.idesc.set(transformed_dout, iwo_group); + args3.wdesc.set(*dfilter, layout, iwo_group); + args3.odesc.set(transformed_ddx_channel, iwo_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search3 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_result.algo = + search3::Find(args3, false, deterministic, workspace_size, dev_ctx); +#else + using search3 = SearchAlgorithm; + filter_result = + search3::Find(dev_ctx, args3, false, deterministic, false); + workspace_size = std::max( + workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); +#endif + } + + if (dx) { + transformed_dx_ = transformed_dx_channel.data(); + + args4.handle = handle; + args4.idesc.set(transformed_dout, iwo_group); + args4.wdesc.set(ddfilter, layout, iwo_group); + args4.odesc.set(transformed_dx_channel, iwo_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_group); +#ifdef PADDLE_WITH_HIP + using search4 = SearchAlgorithm; + workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4)); + fwd_result.algo = + search4::Find(args4, false, deterministic, workspace_size, dev_ctx); +#else + using search4 = SearchAlgorithm; + fwd_result = search4::Find(dev_ctx, args4, false, deterministic, false); + workspace_size = std::max( + workspace_size, search4::GetWorkspaceSize(args4, fwd_result.algo)); +#endif + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW(transformed_x.dims(), + 
GPUDNNDataLayout::kNCHW, + &i_n, + &i_c, + &i_d, + &i_h, + &i_w); + + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(transformed_dout.dims(), + GPUDNNDataLayout::kNCHW, + &o_n, + &o_c, + &o_d, + &o_h, + &o_w); + + int group_offset_in = + transformed_x.numel() / transformed_x.dims()[0] / groups; + int group_offset_out = + transformed_dout.numel() / transformed_dout.dims()[0] / groups; + int group_offset_filter = filter.numel() / groups; + + ScalingParamType alpha = 1.0f; + ScalingParamType beta = 0.0f; + + // auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto workspace_handle = GetDnnWorkspace( + const_cast(&(dev_ctx.GetAllocator())), dev_ctx.stream()); + if (ddout) { + ddx_ = transformed_ddx.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args1.odesc.desc(), + ddx_ + i * group_offset_in, + args1.wdesc.desc(), + filter_ + i * group_offset_filter, + args1.cdesc.desc(), + bwd_result1.algo, + &beta, + args1.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args1, + bwd_result1, + ddx_, + filter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + // MIOPEN ONLY support beta to be 0.0f + DenseTensor conv_x_ddfilter(dout.type()); + conv_x_ddfilter.Resize(transformed_ddout_channel.dims()); + T* conv_x_ddfilter_data = dev_ctx.template Alloc(&conv_x_ddfilter); + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionBackwardData( + handle, + &alpha, + args2.odesc.desc(), + x_ + i * group_offset_in, + args2.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args2.cdesc.desc(), + bwd_result2.algo, + &beta, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + workspace_ptr, + workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenOpTensor( + handle, + miopenTensorOpAdd, + &alpha, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out, + &alpha, + args2.idesc.desc(), + conv_x_ddfilter_data + i * group_offset_out, + &beta, + args2.idesc.desc(), + transformed_ddout_channel_ + i * group_offset_out)); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args2, + bwd_result2, + x_, + ddfilter_, + transformed_ddout_channel_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + true); +#endif // PADDLE_WITH_HIP + + if ((!is_sys_pad) && (!channel_last)) { + if (strides.size() == 2U) { + funcs::Slice( + dev_ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice( + dev_ctx, &transformed_ddout_channel, ddout, starts, ends, axes); + } + } else if ((!is_sys_pad) && (channel_last)) { + if (strides.size() == 2U) { + funcs::Slice(dev_ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } else if (!is_sys_pad && strides.size() == 3U) { + funcs::Slice(dev_ctx, + &transformed_ddout_channel, + &transformed_ddout_channel, + starts, + ends, + axes); + } + + TransToChannelLast( + dev_ctx, &transformed_ddout_channel, ddout); 
+ } + } + + T* transformed_dout_channel_ = transformed_dout.data(); + if (dfilter) { + ddx_ = transformed_ddx_channel.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenConvolutionBackwardWeights( + handle, + &alpha, + args3.odesc.desc(), + ddx_ + i * group_offset_in, + args3.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args3.cdesc.desc(), + filter_result.algo, + &beta, + args3.wdesc.desc(), + dfilter_ + i * group_offset_filter, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args3, + filter_result, + ddx_, + transformed_dout_channel_, + dfilter_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + } + + if (dx) { + ddfilter_ = ddfilter.data(); +#ifdef PADDLE_WITH_HIP + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenConvolutionForward( + handle, + &alpha, + args4.idesc.desc(), + transformed_dout_channel_ + i * group_offset_out, + args4.wdesc.desc(), + ddfilter_ + i * group_offset_filter, + args4.cdesc.desc(), + fwd_result.algo, + &beta, + args4.odesc.desc(), + transformed_dx_ + i * group_offset_in, + workspace_ptr, + workspace_size)); + }, + workspace_size); + } +#else // PADDLE_WITH_HIP + ConvRunner::Apply(dev_ctx, + args4, + fwd_result, + transformed_dout_channel_, + ddfilter_, + transformed_dx_, + groups, + group_offset_out, + group_offset_filter, + group_offset_in, + workspace_size, + &workspace_handle, + false); +#endif // PADDLE_WITH_HIP + + if (channel_last) { + TransToChannelLast(dev_ctx, &transformed_dx_channel, dx); + } + } +} + +template +void Conv3dTransposeGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& filter, + const DenseTensor& dout, + const std::vector& strides, + const std::vector& paddings_, + const std::vector& output_padding, + const std::vector& output_size, + const std::string& padding_algorithm, + int groups, + const std::vector& dilations_, + const std::string& data_format, + DenseTensor* dx, + DenseTensor* dfilter) { + ConvTransposeGradRawGPUDNNKernel(dev_ctx, + x, + filter, + dout, + strides, + paddings_, + padding_algorithm, + groups, + dilations_, + data_format, + dx, + dfilter); +} + +} // namespace phi + +using float16 = phi::dtype::float16; + +PD_REGISTER_PLUGIN_KERNEL(conv2d_transpose_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv2dTransposeGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(conv2d_transpose_double_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv2dTransposeDoubleGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(conv3d_transpose_grad, + metax_gpu, + ALL_LAYOUT, + phi::Conv3dTransposeGradGPUDNNKernel, + float, + double, + float16, + phi::dtype::bfloat16) {} diff --git a/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h deleted file mode 100644 index 03651be95c3..00000000000 --- a/backends/metax_gpu/kernels/impl/spectral_norm_grad_kernel_impl.h +++ /dev/null @@ -1,130 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/impl/spectral_norm_kernel_impl.h" - -namespace phi { - -template -void SpectralNormGradKernel(const Context& dev_ctx, - const DenseTensor& weight, - const DenseTensor& u, - const DenseTensor& v, - const DenseTensor& out_grad, - int dim, - int power_iters, - float eps, - DenseTensor* weight_grad) { - auto& place = *dev_ctx.eigen_device(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - const int h = u.dims()[0]; - const int w = v.dims()[0]; - - DenseTensor weight_mat, out_grad_mat; - auto dims = weight.dims(); - const int rank = dims.size(); - std::vector real_dims; - if (dim != 0) { - std::vector perm; - perm.push_back(dim); - real_dims.push_back(dims[dim]); - for (int i = 0; i < rank; i++) { - if (i != dim) { - perm.push_back(i); - real_dims.push_back(dims[i]); - } - } - weight_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&weight_mat); - out_grad_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&out_grad_mat); - TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat); - TransCompute2DTo5D( - dev_ctx, out_grad, rank, perm, &out_grad_mat); - } else { - for (int i = 0; i < rank; i++) { - real_dims.push_back(i); - } - phi::Copy(dev_ctx, weight, dev_ctx.GetPlace(), true, &weight_mat); - phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), true, &out_grad_mat); - } - weight_mat = weight_mat.Resize({h, w}); - out_grad_mat = out_grad_mat.Resize({h, w}); - - DenseTensor sigma; - sigma.Resize(weight_mat.dims()); - dev_ctx.template Alloc(&sigma); - DenseTensor uu, vv; - phi::Copy(dev_ctx, u, dev_ctx.GetPlace(), true, &uu); - phi::Copy(dev_ctx, v, dev_ctx.GetPlace(), true, &vv); - CalcMatrixSigmaAndNormWeight(dev_ctx, - &weight_mat, - &(uu.Resize({h, 1})), - &(vv.Resize({w, 1})), - &sigma, - power_iters, - eps); - - DenseTensor uv; - uv.Resize({h, w}); - dev_ctx.template Alloc(&uv); - blas.MatMul( - uu.Resize({h, 1}), false, vv.Resize({w, 1}), false, T(1), &uv, T(0)); - - DenseTensor weight_grad_mat; - weight_grad_mat.Resize({h, w}); - dev_ctx.template Alloc(&weight_grad_mat); - auto weight_grad_mat_t = EigenTensor::From(weight_grad_mat); - auto weight_mat_t = EigenTensor::From(weight_mat); - auto out_grad_mat_t = EigenTensor::From(out_grad_mat); - auto sigma_t = EigenTensor::From(sigma); - auto uv_t = EigenTensor::From(uv); - weight_mat_t.device(place) = - weight_mat_t.sum().eval().reshape(Array2(1, 1)).broadcast(Array2(h, w)); - weight_grad_mat_t.device(place) = - out_grad_mat_t * (out_grad_mat_t.constant(1.0) - uv_t * weight_mat_t) / - sigma_t; - - if (dim != 0) { - std::vector perm; - for (int i = 0; i < rank; i++) { - if (i < dim) { - perm.push_back(i + 1); - } else if (i == dim) { - perm.push_back(0); - } else { - perm.push_back(i); - } - } - weight_grad->Resize(dims); - dev_ctx.template Alloc(weight_grad); - TransCompute2DTo5D( - dev_ctx, - weight_grad_mat.Resize(common::make_ddim(real_dims)), - rank, - perm, - weight_grad); - } else { - phi::Copy(dev_ctx, - 
weight_grad_mat.Resize(dims), - dev_ctx.GetPlace(), - true, - weight_grad); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h b/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h deleted file mode 100644 index 8c9fc548259..00000000000 --- a/backends/metax_gpu/kernels/impl/spectral_norm_kernel_impl.h +++ /dev/null @@ -1,182 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "kernels/funcs/blas/blas.h" -#include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace phi { - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; -using IndexPair = Eigen::IndexPair; - -template -static inline void TransCompute2DTo5D(const Context& dev_ctx, - const DenseTensor& in, - const int rank, - const std::vector& perm, - DenseTensor* out) { - if (rank <= 1 || rank > 5) { - PADDLE_THROW(common::errors::Fatal( - "Weight rank of SpectralNorm should be in range [2, 5], but got %d.", - rank)); - } - - switch (rank) { - case 2: - phi::funcs::Transpose trans2; - trans2(dev_ctx, in, out, perm); - break; - case 3: - phi::funcs::Transpose trans3; - trans3(dev_ctx, in, out, perm); - break; - case 4: - phi::funcs::Transpose trans4; - trans4(dev_ctx, in, out, perm); - break; - case 5: - phi::funcs::Transpose trans5; - trans5(dev_ctx, in, out, perm); - break; - default: - break; - } -} - -template -static inline void CalcMatrixSigmaAndNormWeight(const Context& dev_ctx, - DenseTensor* weight, - DenseTensor* u, - DenseTensor* v, - DenseTensor* sigma, - const int power_iters, - const float eps) { - auto& place = *dev_ctx.eigen_device(); - auto blas = funcs::GetBlas(dev_ctx); - auto sigma_t = EigenTensor::From(*sigma); - auto weight_t = EigenTensor::From(*weight); - auto u_t = EigenTensor::From(*u); - auto v_t = EigenTensor::From(*v); - - const int h = weight->dims()[0]; - const int w = weight->dims()[1]; - - for (int i = 0; i < power_iters; i++) { - // V = W^T * U / ||W^T * U||_2 - blas.MatMul(*weight, true, *u, false, T(1), v, T(0)); - auto v_t_norm = - v_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( - Array1(w)); - v_t.device(place) = v_t / (v_t_norm + v_t_norm.constant(eps)); - // U = W^T * V / ||W^T * V||_2 - blas.MatMul(*weight, false, *v, false, T(1), u, T(0)); - auto u_t_norm = - u_t.square().sum().sqrt().eval().reshape(Array1(1)).broadcast( - Array1(h)); - u_t.device(place) = u_t / (u_t_norm + u_t_norm.constant(eps)); - } - DenseTensor weight_v; - weight_v.Resize({h, 1}); - dev_ctx.template Alloc(&weight_v); - blas.MatMul(*weight, false, *v, false, T(1), &weight_v, T(0)); - auto weight_v_t = EigenTensor::From(weight_v); - sigma_t.device(place) = (u_t * weight_v_t) - .sum() - .eval() - .reshape(Array2(1, 1)) - .broadcast(Array2(h, w)); - weight_t.device(place) = weight_t / sigma_t; -} - -template -void SpectralNormKernel(const 
Context& dev_ctx, - const DenseTensor& weight, - const DenseTensor& u, - const DenseTensor& v, - int dim, - int power_iters, - float eps, - DenseTensor* out) { - const int h = u.dims()[0]; - const int w = v.dims()[0]; - - DenseTensor weight_mat; - auto dims = weight.dims(); - const int rank = dims.size(); - std::vector real_dims; - if (dim != 0) { - std::vector perm; - perm.push_back(dim); - real_dims.push_back(dims[dim]); - for (int i = 0; i < rank; i++) { - if (i != dim) { - perm.push_back(i); - real_dims.push_back(dims[i]); - } - } - weight_mat.Resize(common::make_ddim(real_dims)); - dev_ctx.template Alloc(&weight_mat); - TransCompute2DTo5D(dev_ctx, weight, rank, perm, &weight_mat); - } else { - for (int i = 0; i < rank; i++) { - real_dims.push_back(i); - } - phi::Copy(dev_ctx, weight, dev_ctx.GetPlace(), true, &weight_mat); - } - weight_mat = weight_mat.Resize({h, w}); - - DenseTensor sigma; - sigma.Resize(weight_mat.dims()); - dev_ctx.template Alloc(&sigma); - DenseTensor uu, vv; - phi::Copy(dev_ctx, u, dev_ctx.GetPlace(), true, &uu); - phi::Copy(dev_ctx, v, dev_ctx.GetPlace(), true, &vv); - CalcMatrixSigmaAndNormWeight(dev_ctx, - &weight_mat, - &(uu.Resize({h, 1})), - &(vv.Resize({w, 1})), - &sigma, - power_iters, - eps); - - if (dim != 0) { - std::vector perm; - for (int i = 0; i < rank; i++) { - if (i < dim) { - perm.push_back(i + 1); - } else if (i == dim) { - perm.push_back(0); - } else { - perm.push_back(i); - } - } - out->Resize(dims); - dev_ctx.template Alloc(out); - TransCompute2DTo5D( - dev_ctx, - weight_mat.Resize(common::make_ddim(real_dims)), - rank, - perm, - out); - } else { - phi::Copy(dev_ctx, weight_mat.Resize(dims), dev_ctx.GetPlace(), true, out); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_context.cc index 9bd26a170c5..4df4d88b0b4 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_context.cc @@ -15,6 +15,7 @@ #include "kernels/metax_context.h" namespace phi { +bool AllowTF32Cudnn() { return false; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_context.h index 21e9084a977..5974aadcc41 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_context.h @@ -128,6 +128,7 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } +bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu new file mode 100644 index 00000000000..d7540d949a9 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -0,0 +1,650 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "kernels/metax_context.h" +#include "paddle/common/layout.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" +#include "paddle/phi/kernels/instance_norm_grad_kernel.h" + +namespace phi { +template +static __global__ void GradComputeDX(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int sample_size, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + BatchNormParamType mean_val = mean[ncid]; + BatchNormParamType inv_var_val = variance[ncid]; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + BatchNormParamType dy_i = static_cast>(dy[i]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[i]) - mean_val); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = + BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] = static_cast( + (static_cast>(dy[i]) - + dy_sum_val / static_cast>(sample_size) - + (static_cast>(x[i]) - mean_val) * + dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * + scale[c] * inv_var_val); + } +} + +static __device__ __forceinline__ float real_sqrt(float x) { + return 1. / sqrtf(x); +} +static __device__ __forceinline__ double real_sqrt(double x) { + return 1. 
/ sqrt(x); +} + +template +__global__ void DoubleGradComputeDX(const T *x, + const AccT *mean, + const AccT *variance, + const T *ddx, + const T *dy, + const AccT *scale, + const AccT *ddscale, + int C, + int sample_size, + const double epsilon, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ AccT dy_sum_val; + __shared__ AccT ddx_sum_val; + __shared__ AccT dy_mul_ddx_sum_val; + __shared__ AccT dy_mul_x_sub_mean_sum_val; + __shared__ AccT ddx_mul_x_sub_mean_sum_val; + + AccT dy_sum = 0; + AccT ddx_sum = 0; + AccT dy_mul_ddx_sum = 0; + AccT dy_mul_x_sub_mean_sum = 0; + AccT ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT ddx_i = static_cast(ddx[i]); + AccT dy_i = static_cast(dy[i]); + AccT tmp = static_cast(x[i]) - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val = ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(dx[i]); + tmp += + ((static_cast(x[i]) - mean_val) * var_val * var_val * var_val / + sample_size * + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + + 3. 
* dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - static_cast(dy[i])) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * + (ddx_sum_val / sample_size - static_cast(ddx[i]))) * + scale[c]; + dx[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(dx[i]); + tmp += (static_cast(dy[i]) * var_val - + dy_sum_val / sample_size * var_val - + (static_cast(x[i]) - mean_val) * var_val * + dy_mul_x_sub_mean_sum_val * var_val / sample_size) * + ddscale[c]; + dx[i] = static_cast(tmp); + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, + const AccT *mean, + const AccT *variance, + const AccT *ddscale, + const AccT *ddbias, + const T *ddx, + const AccT *scale, + int C, + int sample_size, + const double epsilon, + T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ AccT ddx_sum_val; + __shared__ AccT ddx_mul_x_sub_mean_sum_val; + + AccT ddx_sum = 0; + AccT ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT ddx_i = static_cast(ddx[i]); + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (static_cast(x[i]) - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(ddy[i]); + tmp += scale[c] * var_val * + (static_cast(ddx[i]) - ddx_sum_val / sample_size - + (static_cast(x[i]) - mean_val) * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size); + ddy[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT tmp = static_cast(ddy[i]); + tmp += (static_cast(x[i]) - mean_val) * var_val * ddscale[c]; + ddy[i] = static_cast(tmp); + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] = static_cast(static_cast(ddy[i]) + ddbias[c]); + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, + const AccT *mean, + const AccT *variance, + const T *ddx, + const T *dy, + int C, + int sample_size, + const double epsilon, + AccT *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + AccT mean_val = mean[ncid]; + AccT var_val = variance[ncid]; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ AccT dy_sum_val; + __shared__ AccT dy_mul_x_sub_mean_sum_val; + + AccT dy_sum = 0; + AccT 
dy_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + AccT dy_i = static_cast(dy[i]); + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (static_cast(x[i]) - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + } + __syncthreads(); + if (ddx != nullptr) { + AccT dscale_tmp = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dscale_tmp += + static_cast(ddx[i]) * var_val * + (static_cast(dy[i]) - dy_sum_val / sample_size - + dy_mul_x_sub_mean_sum_val * (static_cast(x[i]) - mean_val) * + var_val * var_val / sample_size); + } + dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); + if (threadIdx.x == 0) { + dscale[ncid] += dscale_tmp; + } + __syncthreads(); + } +} + +template +void InstanceNormGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const paddle::optional &scale, + const paddle::optional &bias UNUSED, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &d_y, + float epsilon_f, + DenseTensor *d_x, + DenseTensor *d_scale, + DenseTensor *d_bias) { + using AccT = typename phi::dtype::MPTypeTrait::Type; + double epsilon = static_cast(epsilon_f); + const auto *scale_ptr = scale.get_ptr(); + + const auto &x_dims = x.dims(); + + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + + DenseTensor x_tmp, d_y_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + d_y_tmp.ShareDataWith(d_y).Resize({1, NxC, H, W, D}); + + phi::funcs::SetConstant set_constant; + + dev_ctx.template Alloc(d_x); + if (x.numel() == 0) { + if (d_scale) { + dev_ctx.template Alloc(d_scale); + set_constant(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + dev_ctx.template Alloc(d_bias); + set_constant(dev_ctx, d_bias, static_cast(0)); + } + return; + } + if (d_scale && d_bias) { + dev_ctx.template Alloc(d_scale); + dev_ctx.template Alloc(d_bias); + } + + if (scale_ptr) { + PADDLE_ENFORCE_EQ( + scale_ptr->dims().size(), + 1UL, + common::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of scale's dimensions must be equal to 1. But " + "received: the size of scale's dimensions" + "is [%d]", + scale_ptr->dims().size())); + PADDLE_ENFORCE_EQ(scale_ptr->dims()[0], + C, + common::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the first dimension of scale must be equal to " + "Channels([%d]). 
But received: " + "the first dimension of scale is [%d]," + "the dimensions of scale is [%s], ", + C, + scale_ptr->dims()[0], + scale_ptr->dims())); + } + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(NxC, max_blocks); + const int grid1 = (C + block - 1) / block; + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + + DenseTensor d_scale_tmp; + d_scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_scale_tmp); + + DenseTensor d_bias_tmp; + d_bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&d_bias_tmp); + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + + if (d_scale && d_bias) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenBatchNormalizationBackward( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + miopenBNSpatial, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnBatchNormalizationBackward( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + d_y_tmp.template data(), + data_desc_, + d_x->template data(), + in_param_desc_, + scale_tmp.template data>(), + d_scale_tmp.template data>(), + d_bias_tmp.template data>(), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } else { + if (d_x) { + GradComputeDX<<>>( + d_y.data(), + scale_tmp.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + H * W * D, + d_x->data()); + } + } + if (d_scale && d_bias) { + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); + } + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +template +void InstanceNormDoubleGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const paddle::optional &scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + const DenseTensor &dy, + const paddle::optional &ddx, + const paddle::optional &ddscale, + const paddle::optional &ddbias, + float epsilon_f, + DenseTensor *dx, + DenseTensor *dscale, + DenseTensor *ddy) { + using AccT = typename phi::dtype::MPTypeTrait::Type; + const auto *Scale = scale.get_ptr(); + const auto *ddX = ddx.get_ptr(); + const auto *ddScale = ddscale.get_ptr(); + const auto *ddBias = ddbias.get_ptr(); + const double epsilon = static_cast(epsilon_f); + const T *x_data = x.data(); + const T *dy_data = dy.data(); + const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); + const AccT *ddscale_data = + (ddScale == nullptr ? nullptr : ddScale->data()); + const AccT *ddbias_data = + (ddScale == nullptr ? 
nullptr : ddBias->data()); + const AccT *mean_data = saved_mean.data(); + const AccT *variance_data = saved_variance.data(); + phi::funcs::SetConstant set_zero; + phi::funcs::SetConstant set_zero_AccT; + + auto &x_dims = x.dims(); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + const int n = x.numel(); + int sample_size = n / N / C; + + DenseTensor scale_tmp; + if (!Scale) { + scale_tmp.Resize({C}); + dev_ctx.template Alloc(&scale_tmp); + set_zero_AccT(dev_ctx, &scale_tmp, static_cast(1)); + } + const AccT *scale_data = Scale ? Scale->data() : scale_tmp.data(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = NxC; + const int grid1 = (C + block - 1) / block; + + if (dx) { + T *dx_data = dev_ctx.template Alloc(dx); + set_zero(dev_ctx, dx, static_cast(0)); + DoubleGradComputeDX + <<>>(x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + scale_data, + ddscale_data, + C, + sample_size, + epsilon, + dx_data); + } + if (dscale) { + DenseTensor dscale_tmp; + dscale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&dscale_tmp); + set_zero_AccT(dev_ctx, &dscale_tmp, static_cast(0)); + AccT *dscale_tmp_data = dscale_tmp.data(); + + AccT *dscale_data = dev_ctx.template Alloc(dscale); + set_zero_AccT(dev_ctx, dscale, static_cast(0)); + DoubleGradComputeDScale + <<>>(x_data, + mean_data, + variance_data, + ddx_data, + dy_data, + C, + sample_size, + epsilon, + dscale_tmp_data); + add_param<<>>( + dscale_tmp.data(), dscale->data(), N, C); + } + if (ddy) { + T *ddy_data = dev_ctx.template Alloc(ddy); + set_zero(dev_ctx, ddy, static_cast(0)); + DoubleGradComputeDDY + <<>>(x_data, + mean_data, + variance_data, + ddscale_data, + ddbias_data, + ddx_data, + scale_data, + C, + sample_size, + epsilon, + ddy_data); + } +} +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(instance_norm_grad, + metax_gpu, + ALL_LAYOUT, + phi::InstanceNormGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} +PD_REGISTER_PLUGIN_KERNEL(instance_norm_double_grad, + metax_gpu, + ALL_LAYOUT, + phi::InstanceNormDoubleGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu new file mode 100644 index 00000000000..db975d74665 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu @@ -0,0 +1,253 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "glog/logging.h" +#include "kernels/metax_context.h" +#include "paddle/common/layout.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" +#include "paddle/phi/kernels/gpu/instance_norm_utils.h" +#include "paddle/phi/kernels/instance_norm_kernel.h" + +namespace phi { + +template +void InstanceNormKernel(const Context &dev_ctx, + const DenseTensor &x, + const paddle::optional &scale, + const paddle::optional &bias, + float epsilon_f, + DenseTensor *y, + DenseTensor *saved_mean, + DenseTensor *saved_variance) { + using AccT = typename phi::dtype::MPTypeTrait::Type; + double epsilon = static_cast(epsilon_f); + auto &x_dims = x.dims(); + PADDLE_ENFORCE_GE(x_dims.size(), + 2, + common::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must greater than " + "or equal to 2. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE(x_dims.size(), + 5, + common::errors::InvalidArgument( + "The `shape` in InstanceNormOp is invalid: " + "the size of X's dimensions must smaller than" + "or equal to 5. But received: " + "the size of X's dimensions is [%d]", + x_dims.size())); + int N, C, H, W, D; + funcs::ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + DenseTensor x_tmp; + x_tmp.ShareDataWith(x).Resize({1, NxC, H, W, D}); + dev_ctx.template Alloc(y); + phi::funcs::SetConstant> functor; + phi::funcs::SetConstant functor_y; + if (x.numel() == 0) { + functor_y(dev_ctx, y, static_cast(0)); + if (saved_mean) { + dev_ctx.template Alloc>(saved_mean); + functor(dev_ctx, saved_mean, static_cast>(0)); + } + if (saved_variance) { + dev_ctx.template Alloc>(saved_variance); + functor(dev_ctx, saved_variance, static_cast>(0)); + } + return; + } + +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t data_desc_; + miopenTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenCreateTensorDescriptor(&in_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetTensorDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + const_cast(dims.data()), + const_cast(strides.data()))); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, miopenBNSpatial)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); +#endif + + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + DenseTensor scale_tmp; + scale_tmp.Resize({NxC}); + dev_ctx.template Alloc(&scale_tmp); + DenseTensor bias_tmp; + bias_tmp.Resize({NxC}); + dev_ctx.template Alloc(&bias_tmp); + + const int n = x.numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min((NxC + block - 1) / block, max_blocks); + + phi::funcs::SetConstant set_constant; + if (scale_ptr) { + repeat_param<<>>( + scale_ptr->data(), scale_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &scale_tmp, static_cast(1)); + } + if (bias_ptr) { + repeat_param<<>>( + bias_ptr->data(), bias_tmp.data(), N, C); + } else { + set_constant(dev_ctx, &bias_tmp, static_cast(0)); + } + + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + DenseTensor saved_mean_tmp, saved_variance_tmp; + + if (saved_mean) { + dev_ctx.template Alloc>(saved_mean); + functor(dev_ctx, saved_mean, static_cast>(0)); + } else { + saved_mean_tmp = phi::Full>( + dev_ctx, {NxC}, static_cast>(0)); + } + if (saved_variance) { + dev_ctx.template Alloc>(saved_variance); + functor(dev_ctx, saved_variance, static_cast>(0)); + } else { + saved_variance_tmp = phi::Full>( + dev_ctx, {NxC}, static_cast>(0)); + } + auto *saved_mean_data = saved_mean + ? saved_mean->data>() + : saved_mean_tmp.data>(); + auto *saved_variance_data = + saved_variance ? saved_variance->data>() + : saved_variance_tmp.data>(); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenBatchNormalizationForwardTraining( + handle, + miopenBNSpatial, + const_cast( + static_cast(CudnnDataType::kOne())), + const_cast( + static_cast(CudnnDataType::kZero())), + data_desc_, + static_cast(x_tmp.template data()), + data_desc_, + static_cast(y->template data()), + in_param_desc_, + const_cast(static_cast( + scale_tmp.template data>())), + const_cast(static_cast( + bias_tmp.template data>())), + 0, + nullptr, + nullptr, + epsilon, + static_cast(saved_mean_data), + static_cast(saved_variance_data))); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenDestroyTensorDescriptor(in_param_desc_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnBatchNormalizationForwardTraining( + handle, + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + x_tmp.template data(), + data_desc_, + y->template data(), + in_param_desc_, + scale_tmp.template data>(), + bias_tmp.template data>(), + 0, + nullptr, + nullptr, + epsilon, + saved_mean_data, + saved_variance_data)); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); +#endif +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(instance_norm, + metax_gpu, + ALL_LAYOUT, + phi::InstanceNormKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16 || + kernel_key.dtype() == phi::DataType::BFLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} 
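Note on the registrations above: both instance_norm kernels compute in an accumulation type AccT taken from phi::dtype::MPTypeTrait, and the registration pins the FP16/BF16 variants' scale/bias inputs to FLOAT32, so the statistics are always accumulated in full precision. Below is a minimal, self-contained sketch of that trait pattern, included only to make the AccT plumbing easier to follow; the float16/bfloat16 structs are hypothetical stand-ins, not Paddle's real types, and this is not the actual MPTypeTrait implementation.

// Minimal sketch of the mixed-precision accumulation-type trait used above
// (as in: using AccT = typename phi::dtype::MPTypeTrait<T>::Type).
// float16 / bfloat16 here are hypothetical stand-ins for phi::dtype types.
#include <type_traits>

namespace sketch {

struct float16 {};   // stand-in for phi::dtype::float16
struct bfloat16 {};  // stand-in for phi::dtype::bfloat16

// By default a type accumulates in itself (float -> float, double -> double).
template <typename T>
struct MPTypeTrait {
  using Type = T;
};

// Half-precision inputs accumulate in float, which is why the registration
// above forces the FP16/BF16 kernels' scale/bias inputs (and the saved
// mean/variance outputs) to FLOAT32.
template <>
struct MPTypeTrait<float16> {
  using Type = float;
};
template <>
struct MPTypeTrait<bfloat16> {
  using Type = float;
};

static_assert(std::is_same<MPTypeTrait<float>::Type, float>::value, "");
static_assert(std::is_same<MPTypeTrait<float16>::Type, float>::value, "");
static_assert(std::is_same<MPTypeTrait<bfloat16>::Type, float>::value, "");

}  // namespace sketch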
diff --git a/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu new file mode 100644 index 00000000000..f99621f8ab9 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_grad_kernel_register.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/spectral_norm_grad_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(spectral_norm_grad, + metax_gpu, + ALL_LAYOUT, + phi::SpectralNormGradKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu new file mode 100644 index 00000000000..466937f993b --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/spectral_norm_kernel_register.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/spectral_norm_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(spectral_norm, + metax_gpu, + ALL_LAYOUT, + phi::SpectralNormKernel, + float, + double) {} diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 184599263fa..682cee35caf 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1028,6 +1028,468 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +index 4099d8b506..baef2cd643 100644 +--- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h ++++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +@@ -14,7 +14,7 @@ + + #pragma once + +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/eigen/common.h" + #include "paddle/phi/kernels/funcs/math_function.h" + +diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py +index 4a5660ea0e..ca4e456e02 100644 +--- a/test/legacy_test/test_batch_norm_op.py ++++ b/test/legacy_test/test_batch_norm_op.py +@@ -22,7 +22,9 @@ from op_test import ( + _set_use_system_allocator, + convert_float_to_uint16, + convert_uint16_to_float, +- get_places, ++ get_devices, ++ is_custom_device, ++ get_device_place, + ) + + import paddle +@@ -189,6 +191,7 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): + + + def create_or_get_tensor(scope, var_name, var, place): ++ + tensor = scope.var(var_name).get_tensor() + if var is not None: + assert isinstance(var, np.ndarray) +@@ -321,7 +324,6 @@ class TestBatchNormOpInference(unittest.TestCase): + fuse_with_relu=self.fuse_with_relu, + epsilon=epsilon, + ) +- + batch_norm_op.run(scope, place) + + # When op is called without Executor then +@@ -454,7 +456,7 @@ class TestBatchNormOpInference(unittest.TestCase): + ) + + def test_check_output(self): +- for place in get_places(): ++ for place in get_devices(): + for data_format in ["NCHW", "NHWC"]: + self.check_with_place( + place, +@@ -488,8 +490,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): + + def test_check_output(self): + places = [] +- if core.is_compiled_with_cuda(): +- place = core.CUDAPlace(0) ++ if core.is_compiled_with_cuda() or is_custom_device(): ++ place = get_device_place() + if core.is_float16_supported(place): + places.append(place) + for place in places: +@@ -510,8 +512,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA or not support the bfloat16", + ) + class TestBF16BatchNormOpInference(TestBatchNormOpInference): +@@ -522,7 +524,7 @@ class TestBF16BatchNormOpInference(TestBatchNormOpInference): + self.init_kernel_type() + + def test_check_output(self): +- places = [core.CUDAPlace(0)] ++ places = [get_device_place()] + for place in places: + # for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: +@@ -562,7 +564,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): + + class TestDygraphBatchNormTrainableStats(unittest.TestCase): + def 
test_dygraph(self): +- for p in get_places(): ++ for p in get_devices(): + shape = [4, 10, 4, 4] + + def compute(x, is_test, trainable_statistics): +@@ -581,7 +583,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): + np.testing.assert_allclose(y1, y2, rtol=1e-05) + + def test_static(self): +- for p in get_places(): ++ for p in get_devices(): + exe = base.Executor(p) + shape = [4, 10, 16, 16] + +@@ -625,7 +627,7 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): + + class TestBatchNormAPI_ZeroSize(unittest.TestCase): + def setUp(self): +- self.places = get_places() ++ self.places = get_devices() + + def test_dygraph(self): + for place in self.places: +diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py +index c9853e9073..277eb26d00 100644 +--- a/test/legacy_test/test_conv3d_transpose_op.py ++++ b/test/legacy_test/test_conv3d_transpose_op.py +@@ -19,7 +19,7 @@ import numpy as np + import paddle + + paddle.enable_static() +-from op_test import OpTest, copy_bits_from_float_to_uint16 ++from op_test import OpTest, copy_bits_from_float_to_uint16, is_custom_device, get_devices, get_device_place + + from paddle.base import core + +@@ -150,7 +150,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): + + def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()), "core is not compiled with CUDA" + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): +@@ -158,20 +158,20 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): + self.dtype = np.float16 + + def test_check_output(self): +- if core.is_compiled_with_cuda(): +- place = core.CUDAPlace(0) ++ if ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()): ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad_no_input(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set={'Input'} +@@ -184,8 +184,8 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): + + def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): +@@ -194,11 +194,11 @@ def create_test_cudnn_bf16_class(parent): + self.dtype = np.uint16 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place(place) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + {'Input', 'Filter'}, +@@ -206,7 +206,7 @@ def create_test_cudnn_bf16_class(parent): + ) + + def test_check_grad_no_filter(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + 
self.check_grad_with_place( + place, + ['Input'], +@@ -215,7 +215,7 @@ def create_test_cudnn_bf16_class(parent): + ) + + def test_check_grad_no_input(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Filter'], +@@ -306,14 +306,14 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_output(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() + + def test_check_grad(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + {'Input', 'Filter'}, +@@ -327,7 +327,7 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_grad_no_filter(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Input'], +@@ -345,7 +345,7 @@ class TestConv3DTransposeOp(OpTest): + + def test_check_grad_no_input(self): + if self.use_cudnn: +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, + ['Filter'], +@@ -471,7 +471,7 @@ class Test_NHWC(TestConv3DTransposeOp): + + # ------------ test_cudnn ------------ + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNN(TestConv3DTransposeOp): + def init_op_type(self): +@@ -481,7 +481,7 @@ class TestCUDNN(TestConv3DTransposeOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + def init_test_case(self): +@@ -500,7 +500,7 @@ class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + def init_test_case(self): +@@ -519,7 +519,7 @@ class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSAMEPad(TestWithSAMEPad): + def init_test_case(self): +@@ -538,7 +538,7 @@ class TestCUDNNWithSAMEPad(TestWithSAMEPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithVALIDPad(TestWithVALIDPad): + def init_test_case(self): +@@ -557,7 +557,7 @@ class TestCUDNNWithVALIDPad(TestWithVALIDPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithStride(TestWithStride): + def init_test_case(self): +@@ -576,7 +576,7 @@ class TestCUDNNWithStride(TestWithStride): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): +@@ 
-610,7 +610,7 @@ class TestCUDNNWithGroups(TestWithGroups): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNN_NHWC(TestConv3DTransposeOp): + def init_test_case(self): +@@ -630,7 +630,7 @@ class TestCUDNN_NHWC(TestConv3DTransposeOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + def init_test_case(self): +@@ -650,7 +650,7 @@ class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + def init_test_case(self): +@@ -670,7 +670,7 @@ class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithStride_NHWC(TestWithStride): + def init_test_case(self): +@@ -690,7 +690,7 @@ class TestCUDNNWithStride_NHWC(TestWithStride): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCUDNNWithGroups_NHWC(TestWithGroups): + def init_test_case(self): +diff --git a/test/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py +index 74eedb6a48..e4c6ecb98a 100644 +--- a/test/legacy_test/test_cross_entropy_op.py ++++ b/test/legacy_test/test_cross_entropy_op.py +@@ -20,6 +20,8 @@ from op_test import ( + get_places, + paddle_static_guard, + randomize_probability, ++ is_custom_device, ++ get_device_place, + ) + + import paddle +@@ -385,19 +387,19 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7): + # Add Fp16 test + def create_test_class(parent, cls_name): + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestCrossEntropyFP16Op(parent): + def init_dtype_type(self): + return np.float16 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-1) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['X'], 'Y', max_relative_error=0.9 +diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py +index 4c9944e877..e6ed5c0f8e 100644 +--- a/test/legacy_test/test_fmin_op.py ++++ b/test/legacy_test/test_fmin_op.py +@@ -15,8 +15,7 @@ + import unittest + + import numpy as np +-from op_test import OpTest, convert_float_to_uint16 +- ++from op_test import OpTest, convert_float_to_uint16, is_custom_device, get_devices, get_device_place + import paddle + from paddle.base import core + +@@ -28,8 +27,8 @@ class ApiFMinTest(unittest.TestCase): + + def setUp(self): + """setUp""" +- if core.is_compiled_with_cuda(): +- self.place = 
core.CUDAPlace(0) ++ if core.is_compiled_with_cuda() or is_custom_device(): ++ self.place = get_device_place() + else: + self.place = core.CPUPlace() + +@@ -259,8 +258,8 @@ class TestElementwiseFmin3Op(OpTest): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda() +- or not core.is_bfloat16_supported(core.CUDAPlace(0)), ++ not (core.is_compiled_with_cuda() or is_custom_device()) ++ or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and not support the bfloat16", + ) + class TestFminBF16OP(OpTest): +@@ -281,13 +280,13 @@ class TestFminBF16OP(OpTest): + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_output_with_place( + place, check_pir=True, check_symbol_infer=False + ) + + def test_check_grad(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_grad_with_place( + place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True + ) +@@ -304,7 +303,7 @@ class TestElementwiseFminOpZeroSize1(TestElementwiseFminOp): + + + @unittest.skipIf( +- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ++ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" + ) + class TestElementwiseFminOp_Stride(OpTest): + no_need_check_grad = True +@@ -335,7 +334,7 @@ class TestElementwiseFminOp_Stride(OpTest): + self.val_dtype = np.float64 + + def test_check_output(self): +- place = core.CUDAPlace(0) ++ place = get_device_place() + self.check_strided_forward = True + self.check_output( + place, +diff --git a/test/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py +index 80e5c2ec63..f1602a8b40 100644 +--- a/test/legacy_test/test_spectral_norm_op.py ++++ b/test/legacy_test/test_spectral_norm_op.py +@@ -112,6 +112,7 @@ class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad): + + class TestSpectralNormOp(TestSpectralNormOpNoGrad): + def test_check_grad_ignore_uv(self): ++ + self.check_grad( + ['Weight'], + 'Out', diff --git a/third_party/flagcx b/third_party/flagcx index 77495cd6a8..7e6c4cc3ca 160000 --- a/third_party/flagcx From a1530d2b4a9837dc9975fff03fac774a45ea702d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 9 Sep 2025 15:41:45 +0800 Subject: [PATCH 02/95] [metax]change_cupti_and_fix_softmax (#7) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/kernels/funcs/softmax.cu | 168 ++++++ .../cross_entropy_grad_kernel_register.cu | 10 +- backends/metax_gpu/patch/paddle.patch | 511 ++---------------- .../metax_gpu/runtime/process_cupti_data.cc | 136 +++-- 4 files changed, 309 insertions(+), 516 deletions(-) create mode 100644 backends/metax_gpu/kernels/funcs/softmax.cu diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu new file mode 100644 index 00000000000..d738a53f43a --- /dev/null +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -0,0 +1,168 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "kernels/metax_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/softmax.h" +#include "paddle/phi/kernels/funcs/softmax_impl.h" + +namespace phi { +namespace funcs { + +using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor; +using DataLayout = phi::backends::gpu::DataLayout; +template +using CudnnDataType = phi::backends::gpu::CudnnDataType; + +template +void SoftmaxCUDNNFunctor::operator()( + const DeviceContext& dev_ctx, + const phi::DenseTensor* X, + phi::DenseTensor* Y) { + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor xDesc; + ScopedTensorDescriptor yDesc; + std::vector cudnn_tensor_dims = common::vectorize(X->dims()); + DataLayout layout = DataLayout::kNCHW; + if (cudnn_tensor_dims.size() == 5) { + layout = DataLayout::kNCDHW; + } + // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor, + // fill 1 at unused dims + if (cudnn_tensor_dims.size() <= 2) { + cudnn_tensor_dims.resize(4, 1); + } +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_x_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + miopenTensorDescriptor_t cudnn_y_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenSoftmaxForward_V2(dev_ctx.cudnn_handle(), + CudnnDataType::kOne(), + cudnn_x_desc, + X->data(), + CudnnDataType::kZero(), + cudnn_y_desc, + dev_ctx.template Alloc(Y), + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); +#else + cudnnTensorDescriptor_t cudnn_x_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_y_desc = + xDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + CudnnDataType::kOne(), + cudnn_x_desc, + X->data(), + CudnnDataType::kZero(), + cudnn_y_desc, + dev_ctx.template Alloc(Y))); +#endif +} + +template +void SoftmaxGradCUDNNFunctor::operator()( + const DeviceContext& dev_ctx, + const phi::DenseTensor* Y, + const phi::DenseTensor* YGrad, + phi::DenseTensor* XGrad) { + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor yDesc; + ScopedTensorDescriptor dyDesc; + ScopedTensorDescriptor dxDesc; + std::vector cudnn_tensor_dims = common::vectorize(Y->dims()); + DataLayout layout = DataLayout::kNCHW; + if (cudnn_tensor_dims.size() == 5) { + layout = DataLayout::kNCDHW; + } + // NOTE(*) : cudnn softmax only support >= 4D phi::DenseTensor, + // fill 1 at unused dims + if (cudnn_tensor_dims.size() <= 2) { + cudnn_tensor_dims.resize(4, 1); + } +#ifdef PADDLE_WITH_HIP + miopenTensorDescriptor_t cudnn_y_desc = + yDesc.descriptor(layout, cudnn_tensor_dims); + miopenTensorDescriptor_t cudnn_xgrad_desc = + dxDesc.descriptor(layout, cudnn_tensor_dims); + miopenTensorDescriptor_t cudnn_ygrad_desc = + dyDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxBackward_V2( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + CudnnDataType::kOne(), + cudnn_y_desc, + Y->data(), + cudnn_ygrad_desc, + YGrad->data(), + CudnnDataType::kZero(), + cudnn_xgrad_desc, + dev_ctx.template Alloc(XGrad), + MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); +#else + cudnnTensorDescriptor_t cudnn_y_desc = + yDesc.descriptor(layout, 
cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_xgrad_desc = + dxDesc.descriptor(layout, cudnn_tensor_dims); + cudnnTensorDescriptor_t cudnn_ygrad_desc = + dyDesc.descriptor(layout, cudnn_tensor_dims); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxBackward( + GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()), + CUDNN_SOFTMAX_ACCURATE, + CUDNN_SOFTMAX_MODE_INSTANCE, + CudnnDataType::kOne(), + cudnn_y_desc, + Y->data(), + cudnn_ygrad_desc, + YGrad->data(), + CudnnDataType::kZero(), + cudnn_xgrad_desc, + dev_ctx.template Alloc(XGrad))); +#endif +} + +template class SoftmaxCUDNNFunctor; +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#if CUDNN_VERSION_MIN(8, 1, 0) +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif + +// MIOPEN do not support double +#ifndef PADDLE_WITH_HIP +template class SoftmaxCUDNNFunctor; +template class SoftmaxGradCUDNNFunctor; +#endif + +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + +} // namespace funcs +} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu index b5de9dd8f3c..402f69a9958 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_grad_kernel_register.cu @@ -149,11 +149,11 @@ void CrossEntropyWithSoftmaxGradGPUKernel(const GPUContext& dev_ctx, int ignore_index, int axis, DenseTensor* logits_grad) { - PADDLE_ENFORCE_EQ( - dev_ctx.GetPlace().GetType(), - phi::AllocationType::GPU, - common::errors::Unavailable("softmax_with_cross_entropy operator's " - "CUDA kernel only runs on GPU device.")); + // PADDLE_ENFORCE_EQ( + // dev_ctx.GetPlace().GetType(), + // phi::AllocationType::GPU, + // common::errors::Unavailable("softmax_with_cross_entropy operator's " + // "CUDA kernel only runs on GPU device.")); const T* loss_grad_data = loss_grad.data(); DenseTensor* logit_grad = logits_grad; diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 682cee35caf..1935217baa0 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -419,7 +419,7 @@ index d69eb67d6f..1d8b6e9375 100644 #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu -index bdfd7313af..546bd07d5e 100644 +index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ @@ -438,7 +438,7 @@ index bdfd7313af..546bd07d5e 100644 #include "paddle/phi/kernels/matmul_kernel.h" diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu -index 1a9a9cfb85..08ebe4b8af 100644 +index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ @@ -470,10 +470,10 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index dc7935423c..84896c2214 100644 +index e30d440ff3..3c74792690 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h -@@ -32,11 +32,11 @@ limitations under the License. */ +@@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" @@ -487,7 +487,7 @@ index dc7935423c..84896c2214 100644 #endif #define MAX_NUM_THREADS 1024 -@@ -200,21 +200,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], +@@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { if (topk[k] < p) { @@ -549,7 +549,7 @@ index dc7935423c..84896c2214 100644 } template -@@ -243,24 +278,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], +@@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template __device__ __forceinline__ void GetTopK(Pair topk[], const T* src, @@ -581,7 +581,7 @@ index dc7935423c..84896c2214 100644 } } } -@@ -287,7 +322,9 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], +@@ -283,7 +318,9 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], } else { for (int k = 0; k < MaxLength; k++) { if (k < MaxLength - (*beam)) { @@ -592,7 +592,7 @@ index dc7935423c..84896c2214 100644 } else { if (largest) { topk[k].set(-static_cast(INFINITY), -1); -@@ -297,8 +334,10 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], +@@ -293,8 +330,10 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[], } } if (!(*is_empty)) { @@ -604,7 +604,7 @@ index dc7935423c..84896c2214 100644 } } -@@ -359,6 +398,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], +@@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } __syncthreads(); @@ -613,7 +613,7 @@ index dc7935423c..84896c2214 100644 if (largest) { input_now = (tid < BlockSize / WARP_SIZE) ? shared_max[lane] -@@ -373,27 +414,32 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], +@@ -369,27 +410,32 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], if (lane == 0) shared_max[0] = input_now; } __syncthreads(); @@ -652,7 +652,7 @@ index dc7935423c..84896c2214 100644 break; } } -@@ -482,16 +528,17 @@ struct Bitfield { +@@ -478,16 +524,17 @@ struct Bitfield { int pos, int len) { unsigned int ret; @@ -674,7 +674,7 @@ index dc7935423c..84896c2214 100644 return ret; } }; -@@ -502,7 +549,9 @@ struct Bitfield { +@@ -498,7 +545,9 @@ struct Bitfield { int pos, int len) { uint64_t ret; @@ -685,7 +685,7 @@ index dc7935423c..84896c2214 100644 return ret; } -@@ -511,9 +560,9 @@ struct Bitfield { +@@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { uint64_t ret; @@ -698,7 +698,7 @@ index dc7935423c..84896c2214 100644 return ret; } }; -@@ -631,14 +680,20 @@ struct RadixTypeConfig { +@@ -627,14 +676,20 @@ struct RadixTypeConfig { /*---------------------------Helper Functions------------------*/ __device__ __forceinline__ int GetLaneId() { int lane_id; @@ -723,7 +723,7 @@ index dc7935423c..84896c2214 100644 } template -@@ -885,7 +940,8 @@ __global__ void GatherKthValue(const T* input, +@@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, // 1. 
Find the k-th value T kth_value = static_cast(0); @@ -733,13 +733,13 @@ index dc7935423c..84896c2214 100644 cur_input, k, num_cols, shared_mem, &kth_value); __shared__ int64_t block_min_idx; -@@ -1318,3 +1374,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, +@@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } } // namespace funcs } // namespace phi +// diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -index 45a29b4cff..8449e3d309 100644 +index 32db61532f..0220316bc3 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ @@ -752,7 +752,7 @@ index 45a29b4cff..8449e3d309 100644 #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h -index 7d05bcb654..c79cdadabc 100644 +index 9d4bb18d55..ea42cc10a9 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h @@ -638,9 +638,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( @@ -767,11 +767,11 @@ index 7d05bcb654..c79cdadabc 100644 } } diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -index ad04265bd6..59481d0e6a 100644 +index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/common/bfloat16.h" +@@ -14,7 +14,7 @@ + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -780,11 +780,11 @@ index ad04265bd6..59481d0e6a 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -index 148d72ca9c..5da3461ebf 100644 +index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/common/bfloat16.h" +@@ -14,7 +14,7 @@ + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -793,7 +793,7 @@ index 148d72ca9c..5da3461ebf 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h -index b16553589a..90080c375d 100644 +index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -29,8 +29,8 @@ namespace cub = hipcub; @@ -833,7 +833,7 @@ index 29fa252e96..4ae72b0935 100644 } diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu -index ee71a2b452..69130ab955 100644 +index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -17,7 +17,7 @@ @@ -846,7 +846,7 @@ index ee71a2b452..69130ab955 100644 namespace phi { diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu -index 
00a2f1e210..1267cf7ec2 100644 +index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu +++ b/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -17,7 +17,7 @@ @@ -872,7 +872,7 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -index 14b24dd3ed..e54a342c98 100644 +index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -21,7 +21,7 @@ limitations under the License. */ @@ -885,7 +885,7 @@ index 14b24dd3ed..e54a342c98 100644 #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -index 06fff0dd58..973049105f 100644 +index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ @@ -1041,461 +1041,12 @@ index 4099d8b506..baef2cd643 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" -diff --git a/test/legacy_test/test_batch_norm_op.py b/test/legacy_test/test_batch_norm_op.py -index 4a5660ea0e..ca4e456e02 100644 ---- a/test/legacy_test/test_batch_norm_op.py -+++ b/test/legacy_test/test_batch_norm_op.py -@@ -22,7 +22,9 @@ from op_test import ( - _set_use_system_allocator, - convert_float_to_uint16, - convert_uint16_to_float, -- get_places, -+ get_devices, -+ is_custom_device, -+ get_device_place, - ) - - import paddle -@@ -189,6 +191,7 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): - - - def create_or_get_tensor(scope, var_name, var, place): -+ - tensor = scope.var(var_name).get_tensor() - if var is not None: - assert isinstance(var, np.ndarray) -@@ -321,7 +324,6 @@ class TestBatchNormOpInference(unittest.TestCase): - fuse_with_relu=self.fuse_with_relu, - epsilon=epsilon, - ) -- - batch_norm_op.run(scope, place) - - # When op is called without Executor then -@@ -454,7 +456,7 @@ class TestBatchNormOpInference(unittest.TestCase): - ) - - def test_check_output(self): -- for place in get_places(): -+ for place in get_devices(): - for data_format in ["NCHW", "NHWC"]: - self.check_with_place( - place, -@@ -488,8 +490,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): - - def test_check_output(self): - places = [] -- if core.is_compiled_with_cuda(): -- place = core.CUDAPlace(0) -+ if core.is_compiled_with_cuda() or is_custom_device(): -+ place = get_device_place() - if core.is_float16_supported(place): - places.append(place) - for place in places: -@@ -510,8 +512,8 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA or not support the bfloat16", - ) - class TestBF16BatchNormOpInference(TestBatchNormOpInference): -@@ -522,7 +524,7 @@ class TestBF16BatchNormOpInference(TestBatchNormOpInference): - self.init_kernel_type() - - def test_check_output(self): -- places = [core.CUDAPlace(0)] -+ places = 
[get_device_place()] - for place in places: - # for data_format in ["NCHW", "NHWC"]: - for data_format in ["NCHW"]: -@@ -562,7 +564,7 @@ class TestDygraphBatchNormAPIError(unittest.TestCase): - - class TestDygraphBatchNormTrainableStats(unittest.TestCase): - def test_dygraph(self): -- for p in get_places(): -+ for p in get_devices(): - shape = [4, 10, 4, 4] - - def compute(x, is_test, trainable_statistics): -@@ -581,7 +583,7 @@ class TestDygraphBatchNormTrainableStats(unittest.TestCase): - np.testing.assert_allclose(y1, y2, rtol=1e-05) - - def test_static(self): -- for p in get_places(): -+ for p in get_devices(): - exe = base.Executor(p) - shape = [4, 10, 16, 16] - -@@ -625,7 +627,7 @@ class TestDygraphBatchNormOpenReserveSpace(unittest.TestCase): - - class TestBatchNormAPI_ZeroSize(unittest.TestCase): - def setUp(self): -- self.places = get_places() -+ self.places = get_devices() - - def test_dygraph(self): - for place in self.places: -diff --git a/test/legacy_test/test_conv3d_transpose_op.py b/test/legacy_test/test_conv3d_transpose_op.py -index c9853e9073..277eb26d00 100644 ---- a/test/legacy_test/test_conv3d_transpose_op.py -+++ b/test/legacy_test/test_conv3d_transpose_op.py -@@ -19,7 +19,7 @@ import numpy as np - import paddle - - paddle.enable_static() --from op_test import OpTest, copy_bits_from_float_to_uint16 -+from op_test import OpTest, copy_bits_from_float_to_uint16, is_custom_device, get_devices, get_device_place - - from paddle.base import core - -@@ -150,7 +150,7 @@ def conv3dtranspose_forward_naive(input_, filter_, attrs): - - def create_test_cudnn_fp16_class(parent, grad_check=True): - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()), "core is not compiled with CUDA" - ) - class TestConv3DTransposeCUDNNFP16(parent): - def init_kernel_type(self): -@@ -158,20 +158,20 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): - self.dtype = np.float16 - - def test_check_output(self): -- if core.is_compiled_with_cuda(): -- place = core.CUDAPlace(0) -+ if ((core.is_compiled_with_cuda() or is_custom_device()) or is_custom_device()): -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-2) - - def test_check_grad_no_filter(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place) and grad_check: - self.check_grad_with_place( - place, ['Input'], 'Output', no_grad_set={'Filter'} - ) - - def test_check_grad_no_input(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place) and grad_check: - self.check_grad_with_place( - place, ['Filter'], 'Output', no_grad_set={'Input'} -@@ -184,8 +184,8 @@ def create_test_cudnn_fp16_class(parent, grad_check=True): - - def create_test_cudnn_bf16_class(parent): - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA and do not support bfloat16", - ) - class TestConv3DTransposeCUDNNBF16(parent): -@@ -194,11 +194,11 @@ def create_test_cudnn_bf16_class(parent): - self.dtype = np.uint16 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place(place) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ 
place = get_device_place() - self.check_grad_with_place( - place, - {'Input', 'Filter'}, -@@ -206,7 +206,7 @@ def create_test_cudnn_bf16_class(parent): - ) - - def test_check_grad_no_filter(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Input'], -@@ -215,7 +215,7 @@ def create_test_cudnn_bf16_class(parent): - ) - - def test_check_grad_no_input(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Filter'], -@@ -306,14 +306,14 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_output(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place(place, atol=1e-5) - else: - self.check_output() - - def test_check_grad(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - {'Input', 'Filter'}, -@@ -327,7 +327,7 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_grad_no_filter(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Input'], -@@ -345,7 +345,7 @@ class TestConv3DTransposeOp(OpTest): - - def test_check_grad_no_input(self): - if self.use_cudnn: -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, - ['Filter'], -@@ -471,7 +471,7 @@ class Test_NHWC(TestConv3DTransposeOp): - - # ------------ test_cudnn ------------ - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNN(TestConv3DTransposeOp): - def init_op_type(self): -@@ -481,7 +481,7 @@ class TestCUDNN(TestConv3DTransposeOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): - def init_test_case(self): -@@ -500,7 +500,7 @@ class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): - def init_test_case(self): -@@ -519,7 +519,7 @@ class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSAMEPad(TestWithSAMEPad): - def init_test_case(self): -@@ -538,7 +538,7 @@ class TestCUDNNWithSAMEPad(TestWithSAMEPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithVALIDPad(TestWithVALIDPad): - def init_test_case(self): -@@ -557,7 +557,7 @@ class TestCUDNNWithVALIDPad(TestWithVALIDPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithStride(TestWithStride): - def init_test_case(self): -@@ -576,7 +576,7 @@ class TestCUDNNWithStride(TestWithStride): - - - 
@unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithGroups(TestWithGroups): - def init_test_case(self): -@@ -610,7 +610,7 @@ class TestCUDNNWithGroups(TestWithGroups): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNN_NHWC(TestConv3DTransposeOp): - def init_test_case(self): -@@ -630,7 +630,7 @@ class TestCUDNN_NHWC(TestConv3DTransposeOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): - def init_test_case(self): -@@ -650,7 +650,7 @@ class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): - def init_test_case(self): -@@ -670,7 +670,7 @@ class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithStride_NHWC(TestWithStride): - def init_test_case(self): -@@ -690,7 +690,7 @@ class TestCUDNNWithStride_NHWC(TestWithStride): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCUDNNWithGroups_NHWC(TestWithGroups): - def init_test_case(self): -diff --git a/test/legacy_test/test_cross_entropy_op.py b/test/legacy_test/test_cross_entropy_op.py -index 74eedb6a48..e4c6ecb98a 100644 ---- a/test/legacy_test/test_cross_entropy_op.py -+++ b/test/legacy_test/test_cross_entropy_op.py -@@ -20,6 +20,8 @@ from op_test import ( - get_places, - paddle_static_guard, - randomize_probability, -+ is_custom_device, -+ get_device_place, - ) - - import paddle -@@ -385,19 +387,19 @@ class TestCrossEntropyOp7RemoveLastDim(TestCrossEntropyOp7): - # Add Fp16 test - def create_test_class(parent, cls_name): - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestCrossEntropyFP16Op(parent): - def init_dtype_type(self): - return np.float16 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-1) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - if core.is_float16_supported(place): - self.check_grad_with_place( - place, ['X'], 'Y', max_relative_error=0.9 -diff --git a/test/legacy_test/test_fmin_op.py b/test/legacy_test/test_fmin_op.py -index 4c9944e877..e6ed5c0f8e 100644 ---- a/test/legacy_test/test_fmin_op.py -+++ b/test/legacy_test/test_fmin_op.py -@@ -15,8 +15,7 @@ - import unittest - - import numpy as np --from op_test import OpTest, convert_float_to_uint16 -- -+from op_test import OpTest, 
convert_float_to_uint16, is_custom_device, get_devices, get_device_place - import paddle - from paddle.base import core - -@@ -28,8 +27,8 @@ class ApiFMinTest(unittest.TestCase): - - def setUp(self): - """setUp""" -- if core.is_compiled_with_cuda(): -- self.place = core.CUDAPlace(0) -+ if core.is_compiled_with_cuda() or is_custom_device(): -+ self.place = get_device_place() - else: - self.place = core.CPUPlace() - -@@ -259,8 +258,8 @@ class TestElementwiseFmin3Op(OpTest): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda() -- or not core.is_bfloat16_supported(core.CUDAPlace(0)), -+ not (core.is_compiled_with_cuda() or is_custom_device()) -+ or not core.is_bfloat16_supported(get_device_place()), - "core is not compiled with CUDA and not support the bfloat16", - ) - class TestFminBF16OP(OpTest): -@@ -281,13 +280,13 @@ class TestFminBF16OP(OpTest): - self.outputs = {'Out': convert_float_to_uint16(out)} - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_output_with_place( - place, check_pir=True, check_symbol_infer=False - ) - - def test_check_grad(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', check_pir=True, check_prim_pir=True - ) -@@ -304,7 +303,7 @@ class TestElementwiseFminOpZeroSize1(TestElementwiseFminOp): - - - @unittest.skipIf( -- not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -+ not (core.is_compiled_with_cuda() or is_custom_device()), "core is not compiled with CUDA" - ) - class TestElementwiseFminOp_Stride(OpTest): - no_need_check_grad = True -@@ -335,7 +334,7 @@ class TestElementwiseFminOp_Stride(OpTest): - self.val_dtype = np.float64 - - def test_check_output(self): -- place = core.CUDAPlace(0) -+ place = get_device_place() - self.check_strided_forward = True - self.check_output( - place, -diff --git a/test/legacy_test/test_spectral_norm_op.py b/test/legacy_test/test_spectral_norm_op.py -index 80e5c2ec63..f1602a8b40 100644 ---- a/test/legacy_test/test_spectral_norm_op.py -+++ b/test/legacy_test/test_spectral_norm_op.py -@@ -112,6 +112,7 @@ class TestSpectralNormOpNoGrad2(TestSpectralNormOpNoGrad): - - class TestSpectralNormOp(TestSpectralNormOpNoGrad): - def test_check_grad_ignore_uv(self): -+ - self.check_grad( - ['Weight'], - 'Out', diff --git a/third_party/flagcx b/third_party/flagcx -index 77495cd6a8..7e6c4cc3ca 160000 +index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx +++ b/third_party/flagcx @@ -1 +1 @@ --Subproject commit 77495cd6a84b1c8f88dd8f6f99e63ef3c84c766f +-Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f +Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa diff --git a/third_party/flashattn b/third_party/flashattn index 581e48aa69..749aca3807 160000 diff --git a/backends/metax_gpu/runtime/process_cupti_data.cc b/backends/metax_gpu/runtime/process_cupti_data.cc index 65011e3f58d..94caca5d8cb 100755 --- a/backends/metax_gpu/runtime/process_cupti_data.cc +++ b/backends/metax_gpu/runtime/process_cupti_data.cc @@ -226,52 +226,126 @@ class CuptiRuntimeCbidStr { CuptiRuntimeCbidStr::CuptiRuntimeCbidStr() { #define REGISTER_RUNTIME_CBID_STR(cbid) \ cbid_str_[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid - REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); - REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020); - REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); - REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); - REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); 
REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); - REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); - REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020); - REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); - REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020); + REGISTER_RUNTIME_CBID_STR(cudaChooseDevice_v3020); REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); + REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetValidDevices_v3020); + REGISTER_RUNTIME_CBID_STR(cudaSetDeviceFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocPitch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFree_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); + REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020); REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020); REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020); - REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); - REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaHostGetFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemGetInfo_v3020); REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020); - REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2DToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToSymbol_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyFromSymbol_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy2DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyToSymbolAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyFromSymbolAsync_v3020); REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020); - REGISTER_RUNTIME_CBID_STR( - cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); - REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020); - REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020); - REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset2DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetSymbolAddress_v3020); + REGISTER_RUNTIME_CBID_STR(cudaGetSymbolSize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTexture2D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaBindTextureToArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020); - REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); - 
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); - REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaStreamQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreate_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020); + REGISTER_RUNTIME_CBID_STR(cudaEventElapsedTime_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMalloc3DArray_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemset3DAsync_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3D_v3020); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DAsync_v3020); REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020); - REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020); - REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020); - REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020); + REGISTER_RUNTIME_CBID_STR(cudaPointerGetAttributes_v4000); + REGISTER_RUNTIME_CBID_STR(cudaHostRegister_v4000); + REGISTER_RUNTIME_CBID_STR(cudaHostUnregister_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceCanAccessPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceEnablePeerAccess_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceDisablePeerAccess_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpyPeerAsync_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DPeer_v4000); + REGISTER_RUNTIME_CBID_STR(cudaMemcpy3DPeerAsync_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceReset_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetLimit_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetLimit_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetCacheConfig_v3020); + REGISTER_RUNTIME_CBID_STR(cudaProfilerInitialize_v4000); + REGISTER_RUNTIME_CBID_STR(cudaProfilerStart_v4000); + REGISTER_RUNTIME_CBID_STR(cudaProfilerStop_v4000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetByPCIBusId_v4010); REGISTER_RUNTIME_CBID_STR(cudaDeviceGetPCIBusId_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcGetEventHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcOpenEventHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcGetMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcOpenMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaIpcCloseMemHandle_v4010); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaDeviceSetSharedMemConfig_v4020); + REGISTER_RUNTIME_CBID_STR(cudaStreamAddCallback_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000); + REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamGetPriority_v5050); + REGISTER_RUNTIME_CBID_STR(cudaStreamGetFlags_v5050); + REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050); + REGISTER_RUNTIME_CBID_STR(cudaMallocManaged_v6000); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000); + REGISTER_RUNTIME_CBID_STR(cudaStreamAttachMemAsync_v6000); + 
REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050); + REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000); + REGISTER_RUNTIME_CBID_STR(cudaGetDeviceFlags_v7000); + REGISTER_RUNTIME_CBID_STR( + cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000); + REGISTER_RUNTIME_CBID_STR(cudaMemRangeGetAttribute_v8000); + REGISTER_RUNTIME_CBID_STR(cudaMemRangeGetAttributes_v8000); #if CUDA_VERSION >= 9000 REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000); REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000); + REGISTER_RUNTIME_CBID_STR(cudaFuncSetAttribute_v9000); + REGISTER_RUNTIME_CBID_STR(cudaGraphLaunch_v10000); + REGISTER_RUNTIME_CBID_STR(cudaStreamSetAttribute_v11000); + REGISTER_RUNTIME_CBID_STR(cudaMallocAsync_v11020); + REGISTER_RUNTIME_CBID_STR(cudaFreeAsync_v11020); #endif #undef REGISTER_RUNTIME_CBID_STR } From 352f02e869be9bccd1c9d154d2c70151626a43ea Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 9 Sep 2025 16:45:38 +0800 Subject: [PATCH 03/95] [Metax] fix dgc & mklml compile product path problem (#8) --- backends/metax_gpu/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 5022e1bdde3..beb442eadad 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -26,6 +26,10 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") set(WITH_MKLML ON) +set(THIRD_PARTY_PATH + "${PADDLE_SOURCE_DIR}/build/third_party" + CACHE PATH "Third party libraries directory.") + include(paddle) include(version) include(generic) @@ -52,10 +56,6 @@ option(ON_INFER "compile with inference c++ lib" OFF) option(WITH_GPU "Compile PaddlePaddle with METAX_GPU" ON) option(WITH_CUSTOM_DEVICE "Compile PaddlePaddle with CUSTOM_DEVICE" ON) -set(THIRD_PARTY_PATH - "${PADDLE_SOURCE_DIR}/build/third_party" - CACHE PATH "Third party libraries directory.") - macro(UNSET_VAR VAR_NAME) unset(${VAR_NAME} CACHE) unset(${VAR_NAME}) From 8f13faed41890653f7f57328674c672c77dcfa4c Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:18:33 +0800 Subject: [PATCH 04/95] [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test (#9) * [Metax] fix dgc & mklml compile product path problem * [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test * [Metax] add mixed_vector fix & update change patch --- backends/metax_gpu/CMakeLists.txt | 2 +- backends/metax_gpu/build.sh | 26 +- backends/metax_gpu/build_in_metax.sh | 17 +- backends/metax_gpu/change_patch.sh | 9 +- .../cuda_kernels/accuracy_kernel_register.cu | 141 ++- backends/metax_gpu/patch/tmp/mixed_vector.cc | 111 ++ backends/metax_gpu/patch/tmp/mixed_vector.h | 413 ++++++++ .../tests/unittest/test_accuracy_op_metax.py | 206 ++++ .../tests/unittest/test_gather_op_metax.py | 983 +++++++++++++++--- 9 files changed, 1740 insertions(+), 168 deletions(-) create mode 100644 backends/metax_gpu/patch/tmp/mixed_vector.cc create mode 100644 backends/metax_gpu/patch/tmp/mixed_vector.h create mode 100644 backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index beb442eadad..4567723123c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -128,7 +128,7 @@ file( 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/arange_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/adadelta_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_check_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/accuracy_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/allclose_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/all_gather_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/all_reduce_kernel.cu diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 0350a32521f..dd0ab3aab90 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -31,25 +31,7 @@ git submodule sync --recursive && git submodule update --init --recursive # apply patch - -rm -r ../../Paddle/third_party/eigen3 - - -cd patch - -unzip mcEigen_3.4.0_paddle_final.zip - -mv mcEigen_3.4.0_paddle_final eigen3 - -cd .. - -cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 - -cd ../../Paddle/ - -git apply --verbose ../backends/metax_gpu/patch/paddle.patch - -cd - +bash change_patch.sh export MACA_PATH=/opt/maca diff --git a/backends/metax_gpu/build_in_metax.sh b/backends/metax_gpu/build_in_metax.sh index b1f9d63d85c..67ec1a2c31c 100644 --- a/backends/metax_gpu/build_in_metax.sh +++ b/backends/metax_gpu/build_in_metax.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,16 +22,7 @@ git submodule sync --recursive && git submodule update --init --recursive # apply patch - -rm -r ../../Paddle/third_party/eigen3 -cd patch -unzip mcEigen_3.4.0_paddle_final.zip -mv mcEigen_3.4.0_paddle_final eigen3 -cd .. -cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 -cd ../../Paddle/ -git apply --verbose ../backends/metax_gpu/patch/paddle.patch -cd - +bash change_patch.sh export MACA_PATH=/opt/maca export CUDA_PATH=/workspace/cuda-11.7/ diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 58bda1aacd4..833ae00f6bd 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,11 +16,12 @@ # limitations under the License. rm -r ../../Paddle/third_party/eigen3 -cd patch +cd patch unzip mcEigen_3.4.0_paddle_final.zip mv mcEigen_3.4.0_paddle_final eigen3 cd .. cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 +cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - diff --git a/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu index 1b26e5711ac..0d61c79d0fa 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/accuracy_kernel_register.cu @@ -1,7 +1,7 @@ // 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights // Reserved. -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,19 +14,150 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/accuracy_kernel.h" +namespace phi { +using phi::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void AccuracyCudaKernel(const int N, + const int D, + const int64_t* Xdata, + const int64_t* labeldata, + int* correct_data, + T* accuracy, + int* total_data) { + using MT = typename phi::dtype::MPTypeTrait::Type; + int count = 0; + __shared__ int total[BlockSize]; + + // support only 1 block + for (int i = threadIdx.x; i < (N); i += BlockSize) { + for (int j = 0; j < D; ++j) { + if (Xdata[i * D + j] == labeldata[i]) { + ++count; + break; + } + } + } + total[threadIdx.x] = count; + __syncthreads(); + + // reduce the count with init value 0, and output accuracy. + // #ifdef PADDLE_WITH_CUDA + // int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); + // #else + // HIP thrust::reduce not support __device__ + for (int s = BlockSize / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + total[threadIdx.x] += total[threadIdx.x + s]; + } + __syncthreads(); + } + int result = total[0]; + // #endif + if (threadIdx.x == 0) { + *correct_data = result; + *accuracy = static_cast(static_cast(result) / static_cast(N)); + *total_data = N; + } +} + +template +void AccuracyKernel(const Context& dev_ctx, + const DenseTensor& inference, + const DenseTensor& indices, + const DenseTensor& label, + DenseTensor* accuracy, + DenseTensor* correct, + DenseTensor* total) { + // FIXME(typhoonzero): only support indices currently + // if add support for output values, how to detect the data type? 
+ const int64_t* indices_data = indices.data(); + const int64_t* label_data = label.data(); + + PADDLE_ENFORCE_EQ( + inference.dims().size(), + 2, + common::errors::InvalidArgument( + "Rank(Input) of AccuracyOp must be 2, with shape " + "[sample_number, class_dim], But received rank(Input) is %d", + inference.dims().size())); + + int* correct_data = dev_ctx.template Alloc(correct); + int* total_data = dev_ctx.template Alloc(total); + T* accuracy_data = dev_ctx.template Alloc(accuracy); + + int num_samples = static_cast(inference.dims()[0]); + size_t infer_width = inference.dims()[1]; + auto stream = dev_ctx.stream(); + phi::backends::gpu::GpuMemsetAsync(accuracy_data, 0, sizeof(T), stream); + + PADDLE_ENFORCE_GT(label.dims().size(), + 0, + common::errors::InvalidArgument( + "Rank(Label) of AccuracyOp must greater than 0, " + "But received rank(Label) is %d", + label.dims().size())); + + PADDLE_ENFORCE_GE(label.dims()[0], + inference.dims()[0], + common::errors::InvalidArgument( + "num_samples(%d) of Label should less than " + "or equal to num_samples(%d) of Input", + label.dims()[0], + num_samples)); + + if (num_samples == 0) { + return; + } + + AccuracyCudaKernel + <<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(num_samples, + infer_width, + indices_data, + label_data, + correct_data, + accuracy_data, + total_data); +} +} // namespace phi + +// FIXME(typhoonzero): types of T is for inference data. +// label data is always int64 +PD_REGISTER_KERNEL(accuracy, + GPU, + ALL_LAYOUT, + phi::AccuracyKernel, + phi::float16, + phi::bfloat16, + float, + double) { + kernel->InputAt(1).SetDataType(phi::DataType::INT64); + kernel->InputAt(2).SetDataType(phi::DataType::INT64); + kernel->OutputAt(1).SetDataType(phi::DataType::INT32); + kernel->OutputAt(2).SetDataType(phi::DataType::INT32); +} + PD_CUSTOM_KERNEL_REGISTER(accuracy, metax_gpu, ALL_LAYOUT, phi::AccuracyKernel, - phi::dtype::float16, - phi::dtype::bfloat16, + phi::float16, + phi::bfloat16, float, double) { - kernel->InputAt(1).SetDataType(phi::DataType::INT32); - kernel->InputAt(2).SetDataType(phi::DataType::INT32); + kernel->InputAt(1).SetDataType(phi::DataType::INT64); + kernel->InputAt(2).SetDataType(phi::DataType::INT64); kernel->OutputAt(1).SetDataType(phi::DataType::INT32); kernel->OutputAt(2).SetDataType(phi::DataType::INT32); } diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.cc b/backends/metax_gpu/patch/tmp/mixed_vector.cc new file mode 100644 index 00000000000..a90113c7977 --- /dev/null +++ b/backends/metax_gpu/patch/tmp/mixed_vector.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/mixed_vector.h" + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "glog/logging.h" +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/common/memory_utils.h" +#include "paddle/utils/none.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void CopyToCPUHelper(std::vector *cpu_, + phi::Allocator::AllocationPtr *gpu_, + size_t *gpu_memory_size_) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // COPY GPU Data To CPU + auto *dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get((*gpu_)->place())); + auto stream = dev_ctx->stream(); + void *src = (*gpu_)->ptr(); + void *dst = cpu_->data(); + auto place = dev_ctx->GetPlace(); + if (place.GetType() == phi::AllocationType::GPU) { + memory_utils::Copy(phi::CPUPlace(), + dst, + OptionalCUDAPlace(*gpu_).get(), + src, + *gpu_memory_size_, + stream); + } else { + memory_utils::Copy(phi::CPUPlace(), + dst, + OptionalCustomPlace(*gpu_).get(), + src, + *gpu_memory_size_, + stream); + } + dev_ctx->Wait(); +#endif +} + +template +void CopyCPUDataToCUDAHelper(std::vector *cpu_, + phi::Allocator::AllocationPtr *gpu_, + size_t *gpu_memory_size_, + const phi::Place &place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + void *src = cpu_->data(); + *gpu_memory_size_ = cpu_->size() * sizeof(T); // sizeof(T) + (*gpu_) = memory_utils::Alloc(place, *gpu_memory_size_); + void *dst = (*gpu_)->ptr(); + auto *dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(place)); + auto stream = dev_ctx->stream(); + if (place.GetType() == phi::AllocationType::GPU) { + memory_utils::Copy(OptionalCUDAPlace(*gpu_).get(), + dst, + phi::CPUPlace(), + src, + *gpu_memory_size_, + stream); + } else { + memory_utils::Copy(OptionalCustomPlace(*gpu_).get(), + dst, + phi::CPUPlace(), + src, + *gpu_memory_size_, + stream); + } + dev_ctx->Wait(); +#endif +} + +#define INSTANTIATE_VECTOR_FOR_TYPE(__TYPE__) \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyToCPU() const { \ + CopyToCPUHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_); \ + } \ + \ + template <> \ + void MixVector<__TYPE__>::VectorData::CopyCPUDataToCUDA( \ + const phi::Place &place) const { \ + CopyCPUDataToCUDAHelper<__TYPE__>(cpu_, &gpu_, &gpu_memory_size_, place); \ + } + +INSTANTIATE_VECTOR_FOR_TYPE(size_t) +INSTANTIATE_VECTOR_FOR_TYPE(int) +INSTANTIATE_VECTOR_FOR_TYPE(int64_t) + +}; // namespace phi diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.h b/backends/metax_gpu/patch/tmp/mixed_vector.h new file mode 100644 index 00000000000..e7cf1e626c9 --- /dev/null +++ b/backends/metax_gpu/patch/tmp/mixed_vector.h @@ -0,0 +1,413 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "glog/logging.h" +#include "paddle/common/errors.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/utils/none.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +using Vector = std::vector; + +inline paddle::optional OptionalCUDAPlace( + const phi::Allocator::AllocationPtr &gpu_) { + return gpu_ == nullptr ? paddle::none + : paddle::optional(gpu_->place()); +} + +inline paddle::optional OptionalCustomPlace( + const phi::Allocator::AllocationPtr &gpu_) { + return gpu_ == nullptr ? paddle::none + : paddle::optional(gpu_->place()); +} + +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. +template +class MixVector { + public: + using value_type = T; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + + private: + // The actual class to implement vector logic + class VectorData { + public: + template + explicit VectorData(std::vector *dat) : cpu_(dat), flag_(kDataInCPU) {} + ~VectorData() {} + + VectorData(const VectorData &o) = delete; + + VectorData &operator=(const VectorData &o) = delete; + + T &operator[](size_t i) { + MutableCPU(); + return (*cpu_)[i]; + } + + const T &operator[](size_t i) const { + ImmutableCPU(); + return (*cpu_)[i]; + } + + size_t size() const { return (*cpu_).size(); } + + iterator begin() { + MutableCPU(); + return (*cpu_).begin(); + } + + iterator end() { + MutableCPU(); + return (*cpu_).end(); + } + + T &front() { + MutableCPU(); + return (*cpu_).front(); + } + + T &back() { + MutableCPU(); + return (*cpu_).back(); + } + + const_iterator begin() const { + ImmutableCPU(); + return (*cpu_).begin(); + } + + const_iterator end() const { + ImmutableCPU(); + return (*cpu_).end(); + } + + const T &back() const { + ImmutableCPU(); + return (*cpu_).back(); + } + + T *data() { return cpu_->data(); } + + const T *data() const { return cpu_->data(); } + + const T &front() const { + ImmutableCPU(); + return (*cpu_).front(); + } + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + MutableCPU(); + (*cpu_).assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + MutableCPU(); + (*cpu_).push_back(elem); + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + MutableCPU(); + auto out_it = std::back_inserter>(*(this->cpu_)); + std::copy(begin, end, out_it); + } + + // resize the vector + void resize(size_t size) { + MutableCPU(); + (*cpu_).resize(size); + } + + // get cuda ptr. immutable + const T *CUDAData(phi::Place place) const { + PADDLE_ENFORCE_EQ( + place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM, + true, + common::errors::Unavailable( + "Place mismatch, CUDA Data must be on CUDA place.")); + ImmutableCUDA(place); + return reinterpret_cast(gpu_->ptr()); + } + + // get cuda ptr. 
mutable + T *CUDAMutableData(phi::Place place) { + const T *ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + (*cpu_).clear(); + flag_ = kDirty | kDataInCPU; + } + + std::vector *get_vector() { return cpu_; } + + size_t capacity() const { return (*cpu_).capacity(); } + + // reserve data + void reserve(size_t size) const { (*cpu_).reserve(size); } + + std::mutex &Mutex() const { return mtx_; } + + paddle::optional CUDAPlace() const { + return OptionalCUDAPlace(gpu_); + } + + paddle::optional CustomPlace() const { + return OptionalCustomPlace(gpu_); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + private: + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const; + + void ImmutableCUDA(phi::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + CopyCPUDataToCUDA(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == gpu_->place())) { + PADDLE_THROW( + common::errors::Unavailable("Unexpected data place mismatch.")); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + CopyCPUDataToCUDA(place); + SetFlag(kDataInCUDA); + } else if (!(place == gpu_->place())) { + PADDLE_THROW( + common::errors::Unavailable("Unexpected data place mismatch.")); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void CopyCPUDataToCUDA(const phi::Place &place) const; + + void ImmutableCPU() const { + if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or + // CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + std::vector *cpu_; + mutable phi::Allocator::AllocationPtr gpu_; + mutable size_t gpu_memory_size_{0}; + mutable int flag_; + + mutable std::mutex mtx_; + }; + + public: + // implicit cast from std::vector. + template + MixVector(const std::vector *dat) { // NOLINT + m_.reset(new VectorData(const_cast *>(dat))); + } + + // Copy ctor + MixVector(const MixVector &other) = delete; + + // Copy operator + MixVector &operator=(const MixVector &other) = delete; + + // Move ctor + MixVector(MixVector &&other) = delete; + + // CPU data access method. Mutable. + T &operator[](size_t i) { return (*m_)[i]; } + + // CPU data access method. Immutable. + const T &operator[](size_t i) const { return (*m_)[i]; } + + // std::vector iterator methods. 
Based on CPU data access method + size_t size() const { return m_->size(); } + + iterator begin() { return m_->begin(); } + + iterator end() { return m_->end(); } + + T &front() { return m_->front(); } + + T &back() { return m_->back(); } + + const_iterator begin() const { return m_->begin(); } + + const_iterator end() const { return m_->end(); } + + const_iterator cbegin() const { return begin(); } + + const_iterator cend() const { return end(); } + + const T &back() const { return m_->back(); } + + T *data() { return m_->data(); } + + const T *data() const { return m_->data(); } + + const T &front() const { return m_->front(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + m_->assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { m_->push_back(elem); } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + m_->Extend(begin, end); + } + + // resize the vector + void resize(size_t size) { + if (m_->size() != size) { + m_->resize(size); + } + } + + // get cuda ptr. immutable + const T *CUDAData(phi::Place place) const { + { + phi::GPUPlace p(place.GetDeviceId()); + auto &mtx = m_->Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_->CUDAPlace(); + if (cuda_place == paddle::none || cuda_place == p) { + return m_->CUDAData(place); + } + } + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); + return CUDAData(place); + } + + // get cuda ptr. mutable + T *CUDAMutableData(phi::Place place) { + { + phi::GPUPlace p(place.GetDeviceId()); + auto &mtx = m_->Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_->CUDAPlace(); + if (cuda_place == paddle::none || cuda_place == p) { + return m_->CUDAMutableData(place); + } + } + m_->MutableCPU(); + m_.reset(new VectorData(m_->get_vector())); + return CUDAMutableData(place); + } + + // clear + void clear() { m_->clear(); } + + size_t capacity() const { return m_->capacity(); } + + // reserve data + void reserve(size_t size) { m_->reserve(size); } + + // the unify method to access CPU or CUDA data. immutable. + const T *Data(phi::Place place) const { + if (place.GetType() == phi::AllocationType::GPU) { + return CUDAData(place); + } else { + return data(); + } + } + + // the unify method to access CPU or CUDA data. mutable. + T *MutableData(phi::Place place) { + if (place.GetType() == phi::AllocationType::GPU) { + return CUDAMutableData(place); + } else { + return data(); + } + } + + void CopyToCPU() { m_->MutableCPU(); } + + const void *Handle() const { return m_.get(); } + + private: + mutable std::unique_ptr m_; +}; + +}; // namespace phi diff --git a/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py b/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py new file mode 100644 index 00000000000..910ef5cd1a6 --- /dev/null +++ b/backends/metax_gpu/tests/unittest/test_accuracy_op_metax.py @@ -0,0 +1,206 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import ( + OpTest, + convert_float_to_uint16, + paddle_static_guard, + is_custom_device, + get_device_place, +) + +import paddle +from paddle import base +from paddle.base import Program, core, program_guard + + +def accuracy_wrapper(infer, indices, label): + return paddle._C_ops.accuracy(infer, indices, label) + + +class TestAccuracyOp(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.python_api = accuracy_wrapper + self.dtype = np.float32 + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype("int64") + label = np.random.randint(0, 2, (n, 1)).astype("int64") + self.inputs = {"Out": infer, "Indices": indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + "Accuracy": np.array(num_correct / float(n)).astype(self.dtype), + "Correct": np.array(num_correct).astype("int32"), + "Total": np.array(n).astype("int32"), + } + + def init_dtype(self): + pass + + def test_check_output(self): + self.check_output(check_pir=True) + + +class TestAccuracyOpFp16(TestAccuracyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-3, check_pir=True) + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()) + or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestAccuracyOpBf16(OpTest): + def setUp(self): + self.op_type = "accuracy" + self.python_api = accuracy_wrapper + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(np.float32) + indices = np.random.randint(0, 2, (n, 1)).astype("int64") + label = np.random.randint(0, 2, (n, 1)).astype("int64") + self.inputs = { + "Out": convert_float_to_uint16(infer), + "Indices": indices, + "Label": label, + } + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + "Accuracy": convert_float_to_uint16( + np.array(num_correct / float(n)).astype(np.float32) + ), + "Correct": np.array(num_correct).astype("int32"), + "Total": np.array(n).astype("int32"), + } + + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + if core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-2, check_pir=True) + + +class TestAccuracyOpError(unittest.TestCase): + def test_type_errors(self): + with ( + paddle_static_guard(), + program_guard(Program(), Program()), + ): + # The input type of accuracy_op must be Variable. + x1 = base.create_lod_tensor(np.array([[-1]]), [[1]], base.CPUPlace()) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int32") + self.assertRaises(TypeError, paddle.static.accuracy, x1, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x1, label) + # The input dtype of accuracy_op must be float32 or float64. 
+ x2 = paddle.static.data(name="x2", shape=[-1, 4], dtype="int32") + self.assertRaises(TypeError, paddle.static.accuracy, x2, label) + self.assertRaises(TypeError, paddle.metric.accuracy, x2, label) + + x3 = paddle.static.data(name="input", shape=[-1, 2], dtype="float32") + paddle.static.accuracy(input=x3, label=label) + paddle.metric.accuracy(input=x3, label=label) + + def test_value_errors(self): + with ( + program_guard(Program(), Program()), + # The input rank of accuracy_op must be 2. + self.assertRaises(ValueError), + ): + x3 = paddle.to_tensor([0.1], dtype="float32") + label3 = paddle.to_tensor(np.reshape([0], [1, 1]), dtype="int32") + paddle.metric.accuracy(x3, label3) + + +class TestAccuracyAPI1(unittest.TestCase): + def run_api(self, accuracy_api): + with ( + paddle_static_guard(), + paddle.static.program_guard(paddle.static.Program()), + ): + self.predictions = paddle.static.data( + shape=[2, 5], name="predictions", dtype="float32" + ) + self.label = paddle.static.data(shape=[2, 1], name="labels", dtype="int64") + self.result = accuracy_api(input=self.predictions, label=self.label, k=1) + self.input_predictions = np.array( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + self.input_labels = np.array([[2], [0]], dtype="int64") + self.expect_value = np.array([0.5], dtype="float32") + exe = paddle.static.Executor() + (result,) = exe.run( + feed={ + "predictions": self.input_predictions, + "labels": self.input_labels, + }, + fetch_list=[self.result], + ) + self.assertEqual((result == self.expect_value).all(), True) + + def test_api(self): + self.run_api(accuracy_api=paddle.static.accuracy) + self.run_api(accuracy_api=paddle.metric.accuracy) + + +class TestAccuracyAPI2(unittest.TestCase): + def test_api(self): + with base.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.static.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype="float32") + self.assertEqual((result.numpy() == expect_value).all(), True) + + +class TestAccuracyAPI(unittest.TestCase): + def test_api(self): + with base.dygraph.guard(): + predictions = paddle.to_tensor( + [[0.2, 0.1, 0.4, 0.1, 0.1], [0.2, 0.3, 0.1, 0.15, 0.25]], + dtype="float32", + ) + label = paddle.to_tensor([[2], [0]], dtype="int64") + result = paddle.metric.accuracy(input=predictions, label=label, k=1) + expect_value = np.array([0.5], dtype="float32") + + self.assertEqual((result.numpy() == expect_value).all(), True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unittest/test_gather_op_metax.py b/backends/metax_gpu/tests/unittest/test_gather_op_metax.py index bdf116571f7..3ce39588838 100644 --- a/backends/metax_gpu/tests/unittest/test_gather_op_metax.py +++ b/backends/metax_gpu/tests/unittest/test_gather_op_metax.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,14 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function import unittest -from op_test import OpTest import numpy as np -import paddle +from op_test import ( + OpTest, + convert_float_to_uint16, + get_devices, + is_custom_device, + get_device_place, +) +from utils import dygraph_guard -paddle.enable_static() +import paddle +from paddle import base +from paddle.base.dygraph.base import switch_to_static_graph +from paddle.framework import core def gather_numpy(x, index, axis): @@ -32,29 +40,119 @@ def gather_numpy(x, index, axis): class TestGatherOp(OpTest): def setUp(self): self.op_type = "gather" - self.place = paddle.CustomPlace("metax_gpu", 0) - self.__class__.use_custom_device = True self.python_api = paddle.gather + self.public_python_api = paddle.gather self.config() - xnp = np.random.random(self.x_shape).astype(self.x_type) - self.inputs = {"X": xnp, "Index": np.array(self.index).astype(self.index_type)} - self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]} + self.prim_op_type = "prim" + self.init_inputs_and_outputs() + self.if_enable_cinn() def test_check_output(self): - self.check_output_with_place(self.place) + self.check_output(check_pir=True, check_symbol_infer=False) def test_check_grad(self): - self.check_grad_with_place(self.place, ["X"], "Out") + self.check_grad(["X"], "Out", check_pir=True, check_prim_pir=True) def config(self): """ For multi-dimension input """ self.x_shape = (10, 20) - self.x_type = "float32" + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int32" + def config_dtype(self): + self.x_type = "float64" + + def init_inputs_and_outputs(self): + xnp = np.random.random(self.x_shape).astype(self.x_type) + if self.x_type == "complex64" or self.x_type == "cpmolex128": + xnp = ( + np.random.randint(-10, 10, size=(10, 10)) + + 1j * np.random.randint(-10, 10, size=(10, 10)) + ).astype(self.x_type) + self.inputs = { + "X": xnp, + "Index": np.array(self.index).astype(self.index_type), + } + self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]} + + def if_enable_cinn(self): + pass + + +class TestGatherOp_ZeroDim(TestGatherOp): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = 100 + self.config_dtype() + self.index = 2 + self.index_type = "int32" + + def if_enable_cinn(self): + self.enable_cinn = False + + +class TestGatherOpFP16(TestGatherOp): + def config_dtype(self): + self.x_type = "float16" + + +# @unittest.skipIf( +# not (core.is_compiled_with_cuda() or is_custom_device()) +# # or core.cudnn_version() < 8100 +# # or paddle.device.cuda.get_device_capability()[0] < 8, +# # "only support compiled with CUDA and cudnn version need larger than 8.1.0 and device's compute capability is at least 8.0", +# ) +class TestGatherOpBFP16(TestGatherOp): + def config_dtype(self): + self.x_type = "float32" + self.dtype = np.uint16 + + def init_inputs_and_outputs(self): + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": np.array(self.index).astype(self.index_type), + } + self.outputs = {"Out": convert_float_to_uint16(xnp[self.inputs["Index"]])} + + def if_enable_cinn(self): + self.enable_cinn = False + + def test_check_output(self): + self.check_output_with_place( + place=get_device_place(), check_pir=True, check_symbol_infer=False + ) + + def test_check_grad(self): + self.check_grad_with_place( + get_device_place(), + ["X"], + "Out", + check_pir=True, + check_prim_pir=True, + ) + + +class TestGatherOpComplex64(TestGatherOp): + def config_dtype(self): + self.x_type = 
"complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOpComplex128(TestGatherOp): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + class TestCase1(TestGatherOp): def config(self): @@ -62,10 +160,42 @@ def config(self): For one dimension input """ self.x_shape = 100 - self.x_type = "float32" + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int32" + def config_dtype(self): + self.x_type = "float64" + + +class TestCase1FP16(TestCase1): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase1BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = 100 + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int32" + + +class TestCase1Complex64(TestCase1): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase1Complex128(TestCase1): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + class TestCase2(TestGatherOp): def config(self): @@ -73,42 +203,574 @@ def config(self): For int64_t index type """ self.x_shape = 100 - self.x_type = "float32" + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase2FP16(TestCase2): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase2BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = 100 + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + +class TestCase2Complex64(TestCase2): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase2Complex128(TestCase2): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase3(TestGatherOp): + def config(self): + """ + For other input type + """ + self.x_shape = (10, 20) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase3Fp16(TestCase3): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase3BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.config_dtype() self.index = [1, 3, 5] self.index_type = "int64" +class TestCase3Complex64(TestCase3): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase3Complex128(TestCase3): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase4(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase4FP16(TestCase4): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase4BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase4Complex64(TestCase4): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase4Complex128(TestCase4): + def config_dtype(self): + self.x_type = 
"complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase5(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1, 3] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase5BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": False} + self.config_dtype() + self.index = [1, 1] + self.index_type = "int32" + + +class TestCase5FP16(TestCase5): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase5Complex64(TestCase5): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase5Complex128(TestCase5): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase6(TestGatherOp): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": True} + self.config_dtype() + self.index = [1, 3] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestCase6FP16(TestCase6): + def config_dtype(self): + self.x_type = "float16" + + +class TestCase6BFP16(TestGatherOpBFP16): + def config(self): + self.x_shape = (10, 20) + self.attrs = {"overwrite": True} + self.config_dtype() + self.index = [1, 3] + self.index_type = "int32" + + +class TestGatherBF16Op(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": index_np, + "Axis": axis_np, + } + out = gather_numpy(self.inputs["X"], index_np, axis_np[0]) + self.outputs = {"Out": out} + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def test_check_grad(self): + self.check_grad(["X"], "Out", numeric_grad_delta=0.5, check_pir=True) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + +class TestGatherNegativeAxis(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + "X": convert_float_to_uint16(xnp), + "Index": index_np, + "Axis": axis_np, + } + out = gather_numpy(self.inputs["X"], index_np, axis_np[0]) + self.outputs = {"Out": out} + + def test_check_output(self): + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device_place()) + for place in places: + self.check_output_with_place(place) + + def test_check_grad(self): + places = [paddle.CPUPlace()] + if core.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device_place()) + for place in places: + self.check_grad_with_place(place, ["X"], "Out", numeric_grad_delta=0.5) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (100, 3) + self.index = [0, 1, -2] + self.index_type = "int32" + self.axis = [-1] + self.axis_type = "int32" + + 
+class TestOutOfRangeError(unittest.TestCase): + def test_dygraph_forward_and_backward(self): + with dygraph_guard(): + x = paddle.randn([100, 3]).cpu() + x.stop_gradient = False + y = paddle.gather( + x, + paddle.to_tensor([0, -2]).cpu(), + axis=-1, + ) + grad_x = paddle.grad(y, x) + + def test_dygraph_error(self): + with dygraph_guard(): + # out of lower bound + with self.assertRaises(IndexError): + _ = paddle.gather( + paddle.randn([100, 3]).cpu(), + paddle.to_tensor([0, -4]).cpu(), + axis=1, + ) + # out of upper bound + with self.assertRaises(IndexError): + _ = paddle.gather( + paddle.randn([100, 3]).cpu(), + paddle.to_tensor([0, 3]).cpu(), + axis=1, + ) + + +class TestCase6Complex64(TestCase6): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestCase6Complex128(TestCase6): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp1(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + axis_np = np.array(self.axis).astype(self.index_type) + index_np = np.array(self.index).astype(self.index_type) + out = gather_numpy(xnp, index_np, axis_np[0]) + self.inputs = {"X": xnp, "Index": index_np, "Axis": axis_np} + self.outputs = {"Out": out} + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def test_check_grad(self): + self.check_grad(["X"], "Out", check_pir=True) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp1FP16(TestGatherOp1): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp1Complex64(TestGatherOp1): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp1Complex128(TestGatherOp1): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp2(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp2FP16(TestGatherOp2): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp2Complex64(TestGatherOp2): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp2Complex128(TestGatherOp2): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp3(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 88, 10) + self.config_dtype() + self.index = [1, 3, 5] + self.index_type = "int64" + self.axis = [2] + self.axis_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp3FP16(TestGatherOp3): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp3Complex64(TestGatherOp3): + def config_dtype(self): + self.x_type = "complex64" + + def 
test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp3Complex128(TestGatherOp3): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp4(TestGatherOp1): + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 100, 10) + self.config_dtype() + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [0] + self.axis_type = "int32" + self.attrs = {"overwrite": False} + + def config_dtype(self): + self.x_type = "float64" + + +class TestGatherOp4FP16(TestGatherOp4): + def config_dtype(self): + self.x_type = "float16" + + +class TestGatherOp4Complex64(TestGatherOp4): + def config_dtype(self): + self.x_type = "complex64" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp4Complex128(TestGatherOp4): + def config_dtype(self): + self.x_type = "complex128" + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestGatherOp5(TestGatherOp): + def config(self): + """ + Test for negative axis + """ + self.x_shape = (3, 100, 10) + self.config_dtype() + self.index = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + self.index_type = "int64" + self.axis = [-1] + self.axis_type = "int32" + self.attrs = {"overwrite": False} + + def config_dtype(self): + self.x_type = "float64" + + def test_check_grad(self): + self.check_grad( + ["X"], + "Out", + check_pir=True, + check_prim_pir=True, + ) + + +class API_TestGather(unittest.TestCase): + def test_out1(self): + with base.program_guard(base.Program(), base.Program()): + data1 = paddle.static.data("data1", shape=[-1, 2], dtype="float64") + index = paddle.static.data("index", shape=[-1, 1], dtype="int64") + out = paddle.gather(data1, index) + place = base.CPUPlace() + exe = base.Executor(place) + input = np.array([[1, 2], [3, 4], [5, 6]]).astype("float64") + index_1 = np.array([1, 2]).astype("int64") + (result,) = exe.run( + feed={"data1": input, "index": index_1}, fetch_list=[out] + ) + expected_output = np.array([[3, 4], [5, 6]]) + np.testing.assert_allclose(result, expected_output, rtol=1e-05) + + def test_out2(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.static.data("x", shape=[-1, 2], dtype="float64") + index = paddle.static.data("index", shape=[-1, 1], dtype="int32") + axis = paddle.static.data("axis", shape=[1], dtype="int32") + out = paddle.gather(x, index, axis) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + x_np = np.array([[1, 2], [3, 4], [5, 6]]).astype("float64") + index_np = np.array([1, 1]).astype("int32") + axis_np = np.array([1]).astype("int32") + (result,) = exe.run( + feed={"x": x_np, "index": index_np, "axis": axis_np}, + fetch_list=[out], + ) + expected_output = gather_numpy(x_np, index_np, axis_np[0]) + np.testing.assert_allclose(result, expected_output, rtol=1e-05) + + class API_TestDygraphGather(unittest.TestCase): def test_out1(self): - paddle.set_device("metax_gpu") paddle.disable_static() - input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("int32") + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) index_1 = np.array([1, 2]) input = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) output = paddle.gather(input, index) output_np = output.numpy() - expected_output = np.array([[3, 4], [5, 6]]).astype("int32") - np.testing.assert_allclose(output_np, expected_output) + expected_output = np.array([[3, 4], [5, 6]]) + np.testing.assert_allclose(output_np, 
expected_output, rtol=1e-05) paddle.enable_static() def test_out12(self): - paddle.set_device("metax_gpu") paddle.disable_static() - input_1 = np.array([[1, 2], [3, 4], [5, 6]]).astype("int32") + input_1 = np.array([[1, 2], [3, 4], [5, 6]]) index_1 = np.array([1, 2]) x = paddle.to_tensor(input_1) index = paddle.to_tensor(index_1) output = paddle.gather(x, index, axis=0) output_np = output.numpy() expected_output = gather_numpy(input_1, index_1, axis=0) - np.testing.assert_allclose(output_np, expected_output) + np.testing.assert_allclose(output_np, expected_output, rtol=1e-05) paddle.enable_static() def test_zero_index(self): - paddle.set_device("metax_gpu") paddle.disable_static() - x = paddle.to_tensor([[1, 2], [3, 4]]).astype("int32") + x = paddle.to_tensor([[1, 2], [3, 4]]) index = paddle.to_tensor(np.array([]).astype("int64")) for axis in range(len(x.shape)): out = paddle.gather(x, index, axis) @@ -117,122 +779,197 @@ def test_zero_index(self): self.assertEqual(list(out.shape), expected_shape) paddle.enable_static() + def test_large_data(self): + if not paddle.is_compiled_with_cuda(): + return -class TestGathertError(unittest.TestCase): - def setUp(self) -> None: - self.place = paddle.CustomPlace("metax_gpu", 0) - paddle.set_device("metax_gpu:0") + x = np.random.rand(226862, 256).astype("float32") + index = np.random.randint(-226862, 22682, size=(8859027)) - def test_error1(self): - paddle.enable_static() - if not paddle.framework.use_pir_api(): + def test_dygraph(): + with base.dygraph.guard(): + gpu_out = paddle.gather(paddle.to_tensor(x), paddle.to_tensor(index)) + return gpu_out.numpy() + + @switch_to_static_graph + def test_static_graph(): with paddle.static.program_guard( paddle.static.Program(), paddle.static.Program() ): - - input_shape = [8, 9, 6] - index_shape = [4] - x_int8 = paddle.static.data( - shape=input_shape, dtype="int8", name="x_int8" - ) - x_float32 = paddle.static.data( - shape=input_shape, dtype="float32", name="x_float32" - ) - axis = paddle.static.data(shape=[1], dtype="float32", name="axis") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - index_float = paddle.static.data( - shape=index_shape, dtype="float32", name="index_float" + x_t = paddle.static.data(name="x", dtype=x.dtype, shape=x.shape) + index_t = paddle.static.data( + name="index", dtype=index.dtype, shape=index.shape ) + out_t = paddle.gather(x_t, index_t) + feed = {x_t.name: x, index_t.name: index} + fetch = [out_t] - def test_x_type(): - paddle.gather(x_int8, index) + gpu_exe = paddle.static.Executor(get_device_place()) + gpu_value = gpu_exe.run(feed=feed, fetch_list=fetch)[0] + return gpu_value - self.assertRaises(TypeError, test_x_type) + np.testing.assert_array_equal(test_dygraph(), test_static_graph()) - def test_index_type(): - paddle.gather(x_float32, index_float) - self.assertRaises(TypeError, test_index_type) +class TestGathertError(unittest.TestCase): + def test_error1(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int8", name="x") + axis = paddle.static.data(shape=[1], dtype="float32", name="axis") + index = paddle.static.data(shape=shape, dtype="int32", name="index") + index_float = paddle.static.data( + shape=shape, dtype="float32", name="index_float" + ) + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises((TypeError, ValueError), test_x_type) + + def test_index_type(): + paddle.gather(x, index_float) + + 
self.assertRaises((TypeError, ValueError), test_index_type) + + def test_axis_dtype(): + paddle.gather(x, index, axis=1.11) - def test_axis_dtype(): - paddle.gather(x_float32, index, axis=1.11) + self.assertRaises((TypeError, ValueError), test_axis_dtype) - self.assertRaises(TypeError, test_axis_dtype) + def test_axis_dtype1(): + paddle.gather(x, index, axis=axis) - def test_axis_dtype1(): - paddle.gather(x_float32, index, axis=axis) + self.assertRaises((TypeError, ValueError), test_axis_dtype1) - self.assertRaises(TypeError, test_axis_dtype1) - else: - paddle.set_device("metax_gpu") - input_shape = [8, 9, 6] - index_shape = [4] + def test_error2(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int8", name="x") + index = paddle.static.data(shape=shape, dtype="int32", name="mask") + index_float = paddle.static.data( + shape=shape, dtype="float32", name="index_float" + ) + + def test_x_type(): + paddle.gather(x, index) + + self.assertRaises((TypeError, ValueError), test_x_type) def test_index_type(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="float32", name="index_float" - ) - out = paddle.gather(x, index) - exe = paddle.static.Executor(place=self.place) - exe.run(paddle.static.default_startup_program()) - self.assertRaises( - ValueError, - exe.run, - paddle.static.default_main_program(), - feed={ - "x": np.random.random(input_shape).astype("float32"), - "index_float": np.random.random(index_shape).astype( - "float32" - ), - }, - ) - - def test_axis_scalar_dtype(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - axis = paddle.static.data(shape=[1], dtype="int32", name="axis") - self.assertRaises(TypeError, paddle.gather, x, index, axis=1.11) - - def test_axis_tensor_dtype(): - with paddle.static.program_guard( - paddle.static.Program(), paddle.static.Program() - ): - x = paddle.static.data(shape=input_shape, dtype="float32", name="x") - index = paddle.static.data( - shape=index_shape, dtype="int32", name="index" - ) - axis = paddle.static.data(shape=[1], dtype="float32", name="axis") - y = paddle.gather(x, index, axis=axis) - exe = paddle.static.Executor(place=self.place) - exe.run(paddle.static.default_startup_program()) - self.assertRaises( - ValueError, - exe.run, - paddle.static.default_main_program(), - feed={ - "x": np.random.random(input_shape).astype("float32"), - "index": np.random.randint(0, 8, index_shape).astype( - "int32" - ), - "axis": np.array([1.11]).astype("float32"), - }, - ) - - test_index_type() - test_axis_scalar_dtype() - # test_axis_tensor_dtype() + paddle.gather(x, index_float) + + self.assertRaises((TypeError, ValueError), test_index_type) + + def test_error3(self): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + shape = [8, 9, 6] + x = paddle.static.data(shape=shape, dtype="int32", name="x") + index = paddle.static.data(shape=shape, dtype="int32", name="index") + + def test_axis_minsize(): + paddle.gather(x, index, axis=-1) + + self.assertRaises(ValueError, test_axis_minsize) + + def test_axis_maxsize(): + paddle.gather(x, index, 
axis=512) + + self.assertRaises(ValueError, test_axis_maxsize) + + +class TestCheckOutType(unittest.TestCase): + def test_out_type(self): + data = paddle.static.data(shape=[16, 10], dtype="int64", name="x") + index = paddle.static.data(shape=[4], dtype="int64", name="index") + out = paddle.gather(data, index) + self.assertTrue(out.dtype == paddle.int64 or out.dtype == core.DataType.INT64) + + def test_pir_out_type(self): + with paddle.pir_utils.IrGuard(): + data = paddle.static.data(shape=[16, 10], dtype="int64", name="x") + index = paddle.static.data(shape=[4], dtype="int64", name="index") + out = paddle.gather(data, index) + self.assertTrue(out.dtype == core.DataType.INT64) + + +class TestGatherBackward(unittest.TestCase): + def setUp(self): + self.shape = [10, 20] + self.dtype = "float32" + self.index = (1, 3, 5) + self.index_dtype = "int64" + self.places = get_devices() + + def test_gather_backward(self): + if len(self.places) != 2: + return + res_list = [] + x_np = np.random.random(self.shape).astype(self.dtype) + index_np = np.array(self.index, dtype=self.index_dtype) + grad_out_np = np.random.random(self.shape).astype(self.dtype) + for place in self.places: + with base.dygraph.guard(place): + x = paddle.to_tensor(x_np, dtype=self.dtype) + x.stop_gradient = False + index = paddle.to_tensor(index_np, dtype=self.index_dtype) + out = paddle.gather(x, index, -1) + grad_out = paddle.to_tensor(grad_out_np, dtype=self.dtype) + (re,) = paddle.grad( + outputs=out, + inputs=x, + grad_outputs=grad_out, + ) + res_list.append(re.numpy()) + np.testing.assert_allclose(res_list[0], res_list[1]) + + +class TestGatherOp_ZeroSize(OpTest): + def setUp(self): + self.op_type = "gather" + self.python_api = paddle.gather + self.public_python_api = paddle.gather + self.config() + self.init_inputs_and_outputs() + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad(["X"], "Out", check_pir=True) + + def config(self): + self.x_shape = (3, 0, 4) + self.config_dtype() + self.index = [2] + self.index_type = "int32" + + def config_dtype(self): + self.x_type = "float64" + + def init_inputs_and_outputs(self): + xnp = np.random.random(self.x_shape).astype(self.x_type) + self.inputs = { + "X": xnp, + "Index": np.array(self.index).astype(self.index_type), + } + self.outputs = {"Out": self.inputs["X"][self.inputs["Index"]]} + + +class TestGatherOp_ZeroSize2(TestGatherOp_ZeroSize): + def config(self): + self.x_shape = (10, 20) + self.config_dtype() + self.index = [2, 0] + self.index_type = "int32" if __name__ == "__main__": + paddle.enable_static() unittest.main() From 893829371efacbff859d0eb83c7ea827f5bb0124 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:29:10 +0800 Subject: [PATCH 05/95] [Metax] update metax_gpu CMakeLists.txt (#10) * [Metax] fix dgc & mklml compile product path problem * [Metax] fix accuracy kernel & add test_accuracy_op_metax.py unit test * [Metax] add mixed_vector fix & update change patch * [Metax] update metax_gpu CMakeLists.txt --- backends/metax_gpu/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 4567723123c..b22d7077e3b 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -26,11 +26,11 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}") set(WITH_MKLML ON) 
+include(paddle) set(THIRD_PARTY_PATH "${PADDLE_SOURCE_DIR}/build/third_party" CACHE PATH "Third party libraries directory.") -include(paddle) include(version) include(generic) include(cblas) From f54187fb3e47ed8062537b9d339c48c7fd711326 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 11 Sep 2025 18:51:43 +0800 Subject: [PATCH 06/95] [metax] updata_qr_kernel (#11) * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .../metax_kernel/qr_kernel_register.cu | 207 +++++++++--------- 1 file changed, 98 insertions(+), 109 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 7b133371f4d..745069e2eda 100644 --- a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,9 +22,8 @@ #include #include -#include "kernels/impl/values_vectors_functor.h" +#include "kernels/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/complex.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" @@ -39,7 +38,6 @@ #include "paddle/phi/kernels/slice_kernel.h" #include "paddle/phi/kernels/transpose_kernel.h" #include "paddle/phi/kernels/tril_triu_kernel.h" - namespace phi { template @@ -358,47 +356,47 @@ void QrKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_HIP #define FUNC_WITH_TYPES(m) m(float, s) m(double, d) -#define GEQRF_BATCH_INSTANCE(T, C) \ - template <> \ - void BatchedGeqrf(const GPUContext& dev_ctx, \ - int batch_size, \ - int m, \ - int n, \ - T* a, \ - int lda, \ - T* tau, \ - int a_stride, \ - int tau_stride) { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - for (int i = 0; i < batch_size; ++i) { \ - T* a_working_ptr = &a[i * a_stride]; \ - T* tau_working_ptr = &tau[i * tau_stride]; \ - PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##geqrf( \ - handle, m, n, a_working_ptr, lda, tau_working_ptr)); \ - } \ +#define GEQRF_BATCH_INSTANCE(T, C) \ + template <> \ + void BatchedGeqrf(const GPUContext& dev_ctx, \ + int batch_size, \ + int m, \ + int n, \ + T* a, \ + int lda, \ + T* tau, \ + int a_stride, \ + int tau_stride) { \ + auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); \ + for (int i = 0; i < batch_size; ++i) { \ + T* a_working_ptr = &a[i * a_stride]; \ + T* tau_working_ptr = &tau[i * tau_stride]; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##geqrf( \ + handle, m, n, a_working_ptr, lda, tau_working_ptr)); \ + } \ } FUNC_WITH_TYPES(GEQRF_BATCH_INSTANCE); -#define ORGQR_BATCH_INSTANCE(T, C) \ - template <> \ - void BatchedOrgqr(const GPUContext& dev_ctx, \ - int batch_size, \ - int m, \ - int n, \ - int k, \ - T* a, \ - int lda, \ - T* tau, \ - int a_stride, \ - int tau_stride) { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - 
for (int i = 0; i < batch_size; ++i) { \ - T* a_working_ptr = &a[i * a_stride]; \ - T* tau_working_ptr = &tau[i * tau_stride]; \ - PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##orgqr( \ - handle, m, n, k, a_working_ptr, lda, tau_working_ptr)); \ - } \ +#define ORGQR_BATCH_INSTANCE(T, C) \ + template <> \ + void BatchedOrgqr(const GPUContext& dev_ctx, \ + int batch_size, \ + int m, \ + int n, \ + int k, \ + T* a, \ + int lda, \ + T* tau, \ + int a_stride, \ + int tau_stride) { \ + auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); \ + for (int i = 0; i < batch_size; ++i) { \ + T* a_working_ptr = &a[i * a_stride]; \ + T* tau_working_ptr = &tau[i * tau_stride]; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::rocsolver_##C##orgqr( \ + handle, m, n, k, a_working_ptr, lda, tau_working_ptr)); \ + } \ } FUNC_WITH_TYPES(ORGQR_BATCH_INSTANCE); @@ -421,7 +419,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, const int64_t a_stride_64 = static_cast(a_stride); const int64_t tau_stride_64 = static_cast(tau_stride); - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); size_t workspace_in_bytes_on_device = 0; @@ -499,7 +496,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } else { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSgeqrf_bufferSize( handle, m, n, a, lda, &lwork)); @@ -555,7 +551,6 @@ void BatchedGeqrf(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cusolverDnDgeqrf_bufferSize(handle, m, n, a, lda, &lwork)); @@ -599,35 +594,33 @@ void BatchedGeqrf(const GPUContext& dev_ctx, } template <> -void BatchedGeqrf>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + phi::complex64* a, + int lda, + phi::complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgeqrf_bufferSize( handle, m, n, reinterpret_cast(a), lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex64* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex64* a_working_ptr = &a[i * a_stride]; + phi::complex64* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCgeqrf( handle, @@ -657,35 +650,33 @@ void BatchedGeqrf>( } template <> -void BatchedGeqrf>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedGeqrf(const GPUContext& dev_ctx, + int batch_size, + int m, + 
int n, + phi::complex128* a, + int lda, + phi::complex128* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgeqrf_bufferSize( handle, m, n, reinterpret_cast(a), lda, &lwork)); DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex128* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex128* a_working_ptr = &a[i * a_stride]; + phi::complex128* tau_working_ptr = &tau[i * tau_stride]; // compute geqrf PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZgeqrf( handle, @@ -727,7 +718,6 @@ void BatchedOrgqr(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); @@ -784,7 +774,6 @@ void BatchedOrgqr(const GPUContext& dev_ctx, int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDorgqr_bufferSize( handle, m, n, k, a, lda, tau, &lwork)); @@ -829,20 +818,18 @@ void BatchedOrgqr(const GPUContext& dev_ctx, } template <> -void BatchedOrgqr>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + phi::complex64* a, + int lda, + phi::complex64* tau, + int a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCungqr_bufferSize( handle, @@ -856,16 +843,16 @@ void BatchedOrgqr>( DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex64* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex64* a_working_ptr = &a[i * a_stride]; + phi::complex64* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnCungqr( handle, @@ -896,20 +883,18 @@ void BatchedOrgqr>( } template <> -void BatchedOrgqr>( - const GPUContext& dev_ctx, - int batch_size, - int m, - int n, - int k, - phi::dtype::complex* a, - int lda, - phi::dtype::complex* tau, - int a_stride, - int tau_stride) { +void BatchedOrgqr(const GPUContext& dev_ctx, + int batch_size, + int m, + int n, + int k, + phi::complex128* a, + int lda, + phi::complex128* tau, + int 
a_stride, + int tau_stride) { int lwork = 0; - // auto handle = dev_ctx.cusolver_dn_handle(); auto handle = GetCusolverDnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZungqr_bufferSize( handle, @@ -923,16 +908,16 @@ void BatchedOrgqr>( DenseTensor workspace = DenseTensor(); workspace.Resize(common::make_ddim({lwork})); - phi::dtype::complex* workspace_ptr = - dev_ctx.template Alloc>(&workspace); + phi::complex128* workspace_ptr = + dev_ctx.template Alloc(&workspace); DenseTensor info = DenseTensor(); info.Resize(common::make_ddim({1})); int* info_d = dev_ctx.template Alloc(&info); for (int i = 0; i < batch_size; ++i) { - phi::dtype::complex* a_working_ptr = &a[i * a_stride]; - phi::dtype::complex* tau_working_ptr = &tau[i * tau_stride]; + phi::complex128* a_working_ptr = &a[i * a_stride]; + phi::complex128* tau_working_ptr = &tau[i * tau_stride]; // compute orggr PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnZungqr( handle, @@ -965,11 +950,15 @@ void BatchedOrgqr>( } // namespace phi +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(qr, GPU, ALL_LAYOUT, phi::QrKernel, float, double) {} +#else PD_REGISTER_PLUGIN_KERNEL(qr, metax_gpu, ALL_LAYOUT, phi::QrKernel, float, double, - phi::dtype::complex, - phi::dtype::complex) {} + phi::complex64, + phi::complex128) {} +#endif From 1e042162a9f7cbb4c08b260bae373122fee1e827 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 15 Sep 2025 10:30:01 +0800 Subject: [PATCH 07/95] [Metax] fix illegal address access error in test_momentum_op (#12) * [Metax] fix illegal address access error in test_momentum_op --- backends/metax_gpu/patch/tmp/mixed_vector.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/patch/tmp/mixed_vector.h b/backends/metax_gpu/patch/tmp/mixed_vector.h index e7cf1e626c9..1dcca9c71b4 100644 --- a/backends/metax_gpu/patch/tmp/mixed_vector.h +++ b/backends/metax_gpu/patch/tmp/mixed_vector.h @@ -386,7 +386,8 @@ class MixVector { // the unify method to access CPU or CUDA data. immutable. const T *Data(phi::Place place) const { - if (place.GetType() == phi::AllocationType::GPU) { + if (place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM) { return CUDAData(place); } else { return data(); @@ -395,7 +396,8 @@ class MixVector { // the unify method to access CPU or CUDA data. mutable. 
T *MutableData(phi::Place place) { - if (place.GetType() == phi::AllocationType::GPU) { + if (place.GetType() == phi::AllocationType::GPU || + place.GetType() == phi::AllocationType::CUSTOM) { return CUDAMutableData(place); } else { return data(); From aca80a41f6f619d995f5944c584c3141fab3ce9e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 15 Sep 2025 11:41:10 +0800 Subject: [PATCH 08/95] [Metax] fix cufft and fix some blas kernel apply (#13) * [Metax] fix cufft and fix some blas kernel apply --- backends/metax_gpu/CMakeLists.txt | 13 ++---- backends/metax_gpu/patch/paddle.patch | 59 +++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index b22d7077e3b..6048b59e6c1 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -618,6 +618,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -683,15 +684,9 @@ file( ${CMAKE_SOURCE_DIR}/kernels/flash_attn_kernel.cu ${CMAKE_SOURCE_DIR}/kernels/flashattn.cc) -list( - REMOVE_ITEM - CUDA_SRCS - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/gru_compute.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/multihead_matmul_functor.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/softmax.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/weight_only_gemv.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math/context_project.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/fft.cu) +list(REMOVE_ITEM CUDA_SRCS + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/softmax.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/weight_only_gemv.cu) file( GLOB diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 1935217baa0..8127caee61e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -133,6 +133,26 @@ index c0080f0a5e..458ca3e2e8 100644 } // namespace dynload } // namespace phi +diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h +index 1547909d92..66b2779392 100644 +--- a/paddle/phi/backends/dynload/cufft.h ++++ b/paddle/phi/backends/dynload/cufft.h +@@ -1,3 +1,4 @@ ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. + /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); +@@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); + cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ + }); \ + EnforceCUFFTLoaded(#__name); \ +- static void* p_##__name = dlsym(cufft_dso_handle, #__name); \ ++ std::string replaced_name = #__name; \ ++ replaced_name = replaced_name.replace(0,2,"mc"); \ ++ static void* p_##__name = dlsym(cufft_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index 59e92955c9..d2f8c2da15 100644 --- a/paddle/phi/backends/dynload/cupti.h @@ -437,6 +457,32 @@ index cb35feee32..64f5bd24ac 100644 #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" +diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu +index 88663ec880..98b93072a3 100644 +--- a/paddle/phi/kernels/funcs/gru_compute.cu ++++ b/paddle/phi/kernels/funcs/gru_compute.cu +@@ -12,7 +12,7 @@ limitations under the License. */ + #include "paddle/phi/kernels/funcs/gru_compute.h" + + #include "paddle/phi/backends/gpu/gpu_context.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" + #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" + +diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h +index 15e1a4a3c3..e4780538d7 100644 +--- a/paddle/phi/kernels/funcs/math/context_project.h ++++ b/paddle/phi/kernels/funcs/math/context_project.h +@@ -18,7 +18,7 @@ + #include + + #include "paddle/phi/core/tensor_utils.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/im2col.h" + + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -469,6 +515,19 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" +diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +index 8b0baf5f5f..260482f124 100644 +--- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu ++++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +@@ -27,7 +27,7 @@ namespace cub = hipcub; + + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" + +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/math_cuda_utils.h" + + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h From fb547db298546f2c3249e22886c2232ba4882987 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 15 Sep 2025 16:04:35 +0800 Subject: [PATCH 09/95] [metax] add warpctc_warprnn (#14) * [metax] fix bug --- backends/metax_gpu/CMakeLists.txt | 2 + backends/metax_gpu/change_patch.sh | 1 + backends/metax_gpu/cmake/warpctc.cmake | 149 ++++++ backends/metax_gpu/cmake/warprnnt.cmake | 142 ++++++ .../warpctc_grad_kernel_register.cu | 2 +- 
.../cuda_kernels/warpctc_kernel_register.cu | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 3 +- .../kernels/impl/warprnnt_kernel_impl.h | 6 +- backends/metax_gpu/patch/intrinsics.cuh | 459 ++++++++++++++++++ backends/metax_gpu/patch/paddle.patch | 26 + 10 files changed, 787 insertions(+), 5 deletions(-) create mode 100644 backends/metax_gpu/cmake/warpctc.cmake create mode 100644 backends/metax_gpu/cmake/warprnnt.cmake create mode 100644 backends/metax_gpu/patch/intrinsics.cuh diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 6048b59e6c1..cca23ab42f5 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -37,6 +37,8 @@ include(cblas) include(flashattn) include(cutlass) include(dgc) +include(warpctc) +include(warprnnt) set(PLUGIN_VERSION ${PADDLE_VERSION}) diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 833ae00f6bd..60d74ec0f3d 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -25,3 +25,4 @@ cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - +cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake new file mode 100644 index 00000000000..71c892a6cfa --- /dev/null +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -0,0 +1,149 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +endif() + +set(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed set(WARPCTC_REPOSITORY +# https://gitee.com/tianjianhe/warp-ctc.git) +set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warpctc) +set(WARPCTC_PATCH_COMMAND "") +set(WARPCTC_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && git apply + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +endif() + +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.patch native_src) + set(WARPCTC_PATCH_COMMAND git checkout -- . 
&& git checkout ${WARPCTC_TAG} + && patch -Nd ${SOURCE_DIR} < ${native_src} &&) + set(WARPCTC_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +if(WITH_ROCM) + set(WARPCTC_PATHCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.rocm.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/devicetypes.cuh.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/hip.cmake.patch) +endif() + +set(WARPCTC_INCLUDE_DIR + "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPCTC_LIB_DIR + "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) + +if(WIN32) + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else() + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS_DEBUG + $) +else() + set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() + +ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPCTC_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_COMMAND} + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}) + +message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) +include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its + # headers. 
+ +add_library(warpctc INTERFACE) +add_dependencies(warpctc extern_warpctc) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake new file mode 100644 index 00000000000..54a7ad6be86 --- /dev/null +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -0,0 +1,142 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPRNNT_WITH_HIP) +endif() + +set(WARPRNNT_PREFIX_DIR ${THIRD_PARTY_PATH}/warprnnt) +set(WARPRNNT_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warprnnt) +set(WARPRNNT_TAG 7ea6bfe748779c245a0fcaa5dd9383826273eff2) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warprnnt) +set(WARPRNNT_PATCH_COMMAND "") +set(WARPRNNT_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + ${CMAKE_COMMAND} -E copy_if_different + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch + "/") +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch) +endif() +if(WITH_ROCM) + set(WARPRNNT_PATCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.rocm.patch) +endif() +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.patch native_src) + set(WARPRNNT_PATCH_COMMAND + git checkout -- . 
&& git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < ${native_src}) + set(WARPRNNT_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +set(WARPRNNT_INCLUDE_DIR + "${WARPRNNT_INSTALL_DIR}/include" + CACHE PATH "Warp-rnnt Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPRNNT_LIB_DIR + "${WARPRNNT_INSTALL_DIR}/lib" + CACHE PATH "Warp-rnnt Library Directory" FORCE) + +if(WIN32) + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +else() + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPRNNT_C_FLAGS $) + set(WARPRNNT_C_FLAGS_DEBUG + $) + set(WARPRNNT_C_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS $) + set(WARPRNNT_CXX_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS_DEBUG + $) +else() + set(WARPRNNT_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() +ExternalProject_Add( + extern_warprnnt + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPRNNT_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPRNNT_PATCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPRNNT_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPRNNT_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPRNNT_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPRNNT_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPRNNT_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPRNNT_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPRNNT_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPRNNT_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPRNNT_LIBRARIES}) + +message(STATUS "warp-rnnt library: ${WARPRNNT_LIBRARIES}") +get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) +include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its + # headers. 
+ +add_library(warprnnt INTERFACE) +# set_property(TARGET warprnnt PROPERTY IMPORTED_LOCATION ${WARPRNNT_LIBRARIES}) +add_dependencies(warprnnt extern_warprnnt) diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu index e77a29d12fe..d02f805a671 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu @@ -17,7 +17,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_grad_kernel.h" -PD_REGISTER_PLUGIN_KERNEL(warpctc_grad, +PD_CUSTOM_KERNEL_REGISTER(warpctc_grad, metax_gpu, ALL_LAYOUT, phi::WarpctcGradKernel, diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu index 5b343506cad..c488e23fba9 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_kernel.h" -PD_REGISTER_PLUGIN_KERNEL( +PD_CUSTOM_KERNEL_REGISTER( warpctc, metax_gpu, ALL_LAYOUT, phi::WarpctcKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index eb64f21c90f..9794ba1b3c0 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -204,7 +204,8 @@ class WarpCTCFunctor { void init(const Context& dev_ctx, const size_t blank) { warpctc_version_ = phi::dynload::get_warpctc_version(); - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 96e756b16b1..bb4311f5912 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -138,7 +138,8 @@ class WarpRNNTFunctor { // There is no memory allocated operations within warp-rnnt. rnntStatus_t status = RNNT_STATUS_UNKNOWN_ERROR; bool gpu = false; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu = true; #else @@ -207,7 +208,8 @@ class WarpRNNTFunctor { options_.fastemit_lambda = fastemit_lambda; options_.batch_first = true; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = RNNT_GPU; options_.stream = diff --git a/backends/metax_gpu/patch/intrinsics.cuh b/backends/metax_gpu/patch/intrinsics.cuh new file mode 100644 index 00000000000..71365b6577c --- /dev/null +++ b/backends/metax_gpu/patch/intrinsics.cuh @@ -0,0 +1,459 @@ +/****************************************************************************** + * Copyright (c) 2013, NVIDIA CORPORATION. 
All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * + * Code and text by Sean Baxter, NVIDIA Research + * See http://nvlabs.github.io/moderngpu for repository and documentation. 
+ * + ******************************************************************************/ + +#include "devicetypes.cuh" + +#pragma once + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" + +namespace mgpu { + +MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) { + return *reinterpret_cast(&x); +} +MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) { + return *reinterpret_cast(&x); +} + +MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) { + return *reinterpret_cast(&x); +} +MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) { + return *reinterpret_cast(&x); +} + +MGPU_HOST_DEVICE int2 double_as_int2(double x) { + return *reinterpret_cast(&x); +} +MGPU_HOST_DEVICE double int2_as_double(int2 x) { + return *reinterpret_cast(&x); +} + +MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) { + reinterpret_cast(&d)[0] = x; +} +MGPU_HOST_DEVICE int GetDoubleX(double d) { + return double_as_int2(d).x; +} +MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) { + reinterpret_cast(&d)[1] = y; +} +MGPU_HOST_DEVICE int GetDoubleY(double d) { + return double_as_int2(d).y; +} + + +//////////////////////////////////////////////////////////////////////////////// +// PTX for bfe and bfi + +#if __CUDA_ARCH__ >= 200 + +MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) { + uint result; + asm("bfe.u32 %0, %1, %2, %3;" : + "=r"(result) : "r"(x), "r"(bit), "r"(numBits)); + return result; +} + + +MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) { + uint result; + asm("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits)); + return result; +} + +MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) { + uint ret; + asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + +#endif // __CUDA_ARCH__ >= 200 + + +//////////////////////////////////////////////////////////////////////////////// +// shfl_up + +__device__ __forceinline__ float shfl_up(float var, + unsigned int delta, int width = 32) { + +#if __CUDA_ARCH__ >= 300 +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + var = __shfl_up_sync(0xFFFFFFFF, var, delta, width); +#else + var = __shfl_up(var, delta, width); +#endif +#endif + return var; +} + +__device__ __forceinline__ double shfl_up(double var, + unsigned int delta, int width = 32) { + +#if __CUDA_ARCH__ >= 300 + int2 p = mgpu::double_as_int2(var); +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width); + p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width); +#else + p.x = __shfl_up(p.x, delta, width); + p.y = __shfl_up(p.y, delta, width); +#endif + var = mgpu::int2_as_double(p); +#endif + + return var; +} + +//////////////////////////////////////////////////////////////////////////////// +// shfl_add + +// MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { +// int result = 0; +// #if __CUDA_ARCH__ >= 300 +// int mask = (WARP_SIZE - width)<< 8; +// #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +// asm( +// "{.reg .s32 r0;" +// ".reg .pred p;" +// "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;" +// "@p add.s32 r0, r0, %4;" +// "mov.s32 %0, r0; }" +// : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +// #else +// asm( +// "{.reg .s32 r0;" +// ".reg .pred p;" +// "shfl.up.b32 r0|p, %1, %2, %3;" +// "@p add.s32 r0, r0, %4;" +// "mov.s32 %0, r0; }" +// : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +// #endif +// #endif +// return result; +// } + +MGPU_DEVICE int 
shfl_add(int x, int offset, int width = 32) +{ +#if __CUDA_ARCH__ >= 300 + unsigned fullMask = 0xffffffffU; + unsigned mask = (width == 32) ? fullMask : ((1U << width) - 1U); + int src = 0; +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9 + src = __shfl_up_sync(mask, x, offset, width); // CUDA 9+ +#else + src = __shfl_up(x, offset, width); // CUDA 8- +#endif + int lane = threadIdx.x & 31; + return (lane >= offset) ? (src + x) : x; +#else + return x; +#endif +} + +MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + int result = 0; +#if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.b32 r0|p, %1, %2, %3;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#endif +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// brev, popc, clz, bfe, bfi, prmt + +// Reverse the bits in an integer. +MGPU_HOST_DEVICE uint brev(uint x) { +#if __CUDA_ARCH__ >= 200 + uint y = __brev(x); +#else + uint y = 0; + for(int i = 0; i < 32; ++i) + y |= (1 & (x>> i))<< (31 - i); +#endif + return y; +} + +// Count number of bits in a register. +MGPU_HOST_DEVICE int popc(uint x) { +#if __CUDA_ARCH__ >= 200 + return __popc(x); +#else + int c; + for(c = 0; x; ++c) + x &= x - 1; + return c; +#endif +} + +// Count leading zeros - start from most significant bit. +MGPU_HOST_DEVICE int clz(int x) { +#if __CUDA_ARCH__ >= 200 + return __clz(x); +#else + for(int i = 31; i >= 0; --i) + if((1<< i) & x) return 31 - i; + return 32; +#endif +} + +// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. +MGPU_HOST_DEVICE int ffs(int x) { +#if __CUDA_ARCH__ >= 200 + return __ffs(x); +#else + for(int i = 0; i < 32; ++i) + if((1<< i) & x) return i + 1; + return 0; +#endif +} + +MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { +#if __CUDA_ARCH__ >= 200 + return bfe_ptx(x, bit, numBits); +#else + return ((1<< numBits) - 1) & (x>> bit); +#endif +} + +MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = bfi_ptx(x, y, bit, numBits); +#else + if(bit + numBits > 32) numBits = 32 - bit; + uint mask = ((1<< numBits) - 1)<< bit; + result = y & ~mask; + result |= mask & (x<< bit); +#endif + return result; +} + +MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = prmt_ptx(a, b, index); +#else + result = 0; + for(int i = 0; i < 4; ++i) { + uint sel = 0xf & (index>> (4 * i)); + uint x = ((7 & sel) > 3) ? b : a; + x = 0xff & (x>> (8 * (3 & sel))); + if(8 & sel) x = (128 & x) ? 0xff : 0; + result |= x<< (8 * i); + } +#endif + return result; +} + +// Find log2(x) and optionally round up to the next integer logarithm. +MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { + int a = 31 - clz(x); + if(roundUp) a += !MGPU_IS_POW_2(x); + return a; +} + +//////////////////////////////////////////////////////////////////////////////// +// vset4 + +#if __CUDA_ARCH__ >= 300 + +// Performs four byte-wise comparisons and returns 1 for each byte that +// satisfies the conditional, and zero otherwise. 
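+// vset4_lt_add(a, b, c) adds 1 to each byte lane of c whose byte of a is less
+// than the corresponding byte of b; vset4_eq(a, b) yields 0x01 in each byte
+// lane where a and b match and 0x00 otherwise. The portable fallbacks below
+// implement the same semantics without the vset4 PTX instruction.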
+MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) { + uint result; + asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" : + "=r"(result) : "r"(a), "r"(b), "r"(c)); + return result; +} +MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) { + uint result; + asm("vset4.u32.u32.eq %0, %1, %2, %3;" : + "=r"(result) : "r"(a), "r"(b), "r"(0)); + return result; +} +#endif // __CUDA_ARCH__ >= 300 + +MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) { + uint result; +#if __CUDA_ARCH__ >= 300 + result = vset4_lt_add_ptx(a, b, c); +#else + result = c; + if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001; + if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100; + if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000; + if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000; +#endif + return result; +} + +MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) { + uint result; +#if __CUDA_ARCH__ >= 300 + result = vset4_eq_ptx(a, b); +#else + result = 0; + if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001; + if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100; + if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000; + if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000; +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// + +MGPU_HOST_DEVICE uint umulhi(uint x, uint y) { +#if __CUDA_ARCH__ >= 100 + return __umulhi(x, y); +#else + uint64 product = (uint64)x * y; + return (uint)(product>> 32); +#endif +} + +//////////////////////////////////////////////////////////////////////////////// +// ldg() function defined for all devices and all types. Only compiles to __ldg +// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported +// by __ldg in sm_32_intrinsics.h + +template +struct IsLdgType { + enum { value = false }; +}; +#define DEFINE_LDG_TYPE(T) \ + template<> struct IsLdgType { enum { value = true }; }; + +template::value> +struct LdgShim { + MGPU_DEVICE static T Ldg(const T* p) { + return *p; + } +}; + +#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 + + // List of __ldg-compatible types from sm_32_intrinsics.h. + DEFINE_LDG_TYPE(char) + DEFINE_LDG_TYPE(short) + DEFINE_LDG_TYPE(int) + DEFINE_LDG_TYPE(long long) + DEFINE_LDG_TYPE(char2) + DEFINE_LDG_TYPE(char4) + DEFINE_LDG_TYPE(short2) + DEFINE_LDG_TYPE(short4) + DEFINE_LDG_TYPE(int2) + DEFINE_LDG_TYPE(int4) + DEFINE_LDG_TYPE(longlong2) + + DEFINE_LDG_TYPE(unsigned char) + DEFINE_LDG_TYPE(unsigned short) + DEFINE_LDG_TYPE(unsigned int) + DEFINE_LDG_TYPE(unsigned long long) + DEFINE_LDG_TYPE(uchar2) + DEFINE_LDG_TYPE(uchar4) + DEFINE_LDG_TYPE(ushort2) + DEFINE_LDG_TYPE(ushort4) + DEFINE_LDG_TYPE(uint2) + DEFINE_LDG_TYPE(uint4) + DEFINE_LDG_TYPE(ulonglong2) + + DEFINE_LDG_TYPE(float) + DEFINE_LDG_TYPE(double) + DEFINE_LDG_TYPE(float2) + DEFINE_LDG_TYPE(float4) + DEFINE_LDG_TYPE(double2) + + template struct LdgShim { + MGPU_DEVICE static T Ldg(const T* p) { + return __ldg(p); + } + }; +#endif + +template +MGPU_DEVICE T ldg(const T* p) { + return LdgShim::Ldg(p); +} + +//////////////////////////////////////////////////////////////////////////////// + +// Fast division for 31-bit integers. +// Uses the method in Hacker's Delight (2nd edition) page 228. +// Evaluates for denom > 1 and x < 2^31. 
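+// The constructor precomputes p = 31 + ceil(log2(denom)) and
+// coef = ceil(2^p / denom), so Divide(x) = umulhi(x, coef) >> (p - 32),
+// i.e. (x * coef) >> p, which equals x / denom over that range; a runtime
+// division becomes one high-half multiply plus a shift.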
+struct FastDivide { + uint denom; + uint coef; + uint shift; + + MGPU_HOST_DEVICE uint Divide(uint x) { + return umulhi(x, coef)>> shift; + } + MGPU_HOST_DEVICE uint Modulus(uint x) { + return x - Divide(x) * denom; + } + + explicit FastDivide(uint denom_) { + denom = denom_; + uint p = 31 + FindLog2(denom, true); + coef = (uint)(((1ull<< p) + denom - 1) / denom); + shift = p - 32; + } +}; + +#pragma GCC diagnostic pop + +} // namespace mgpu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 8127caee61e..0283a443adb 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -1087,6 +1087,32 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h +index 7b85903776..3f4b298807 100644 +--- a/paddle/phi/kernels/impl/merged_momentum_impl.h ++++ b/paddle/phi/kernels/impl/merged_momentum_impl.h +@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( + params_out[idx], + velocities_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + phi::funcs::ForRange for_range( + static_cast(dev_ctx), params[idx]->numel()); + const auto grad_type = grads[idx]->dtype(); +diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h +index de5bcfc30b..eb2a9714f5 100644 +--- a/paddle/phi/kernels/impl/momentum_kernel_impl.h ++++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h +@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, + regularization_coeff, + param_out, + velocity_out); +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + funcs::ForRange for_range(dev_ctx, param.numel()); + const auto grad_type = grad.dtype(); + #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h From 8e981985c3b9f2e6bfc3789d92b48fed42abace1 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 15 Sep 2025 17:40:04 +0800 Subject: [PATCH 10/95] [Metax] update metax CI (#15) * [Metax] update metax CI --- backends/metax_gpu/tests/CMakeLists.txt | 100 ++++- .../check_diff_metax_legacy_unit_test.sh | 108 +++++ .../tests/unit_test/test_abs_metax.py | 39 ++ .../tests/unit_test/test_arange_metax.py | 260 ++++++++++++ .../test_bfloat16_embedding_metax.py | 72 ++++ .../unit_test/test_count_nonzero_api_metax.py | 81 ++++ .../unit_test/test_gaussian_nll_loss_metax.py | 208 +++++++++ .../tests/unit_test/test_greater_equal.py | 44 ++ ...bate_build_src_rank_and_local_expert_id.py | 62 +++ ...test_incubate_expand_modality_expert_id.py | 172 ++++++++ .../test_incubate_fused_rmsnorm_ext_metax.py | 95 +++++ .../unit_test/test_incubate_moe_combine.py | 193 +++++++++ ...moe_gate_dispatch_partial_nosoftmaxtopk.py | 218 ++++++++++ ...st_incubate_moe_gate_dispatch_w_permute.py 
| 207 +++++++++ ...ncubate_moe_gate_dispatch_w_permute_bwd.py | 175 ++++++++ .../tests/unit_test/test_layer_norm.py | 358 ++++++++++++++++ .../tests/unit_test/test_matmul_op__metax.py | 395 ++++++++++++++++++ .../tests/unit_test/test_nonzero_api_metax.py | 220 ++++++++++ .../tests/unit_test/test_p_norm_op_metax.py | 215 ++++++++++ .../tests/unit_test/test_squeeze_op_metax.py | 125 ++++++ .../tests/unit_test/test_swiglu_metax.py | 295 +++++++++++++ .../tests/unit_test/test_top_p_sampling.py | 162 +++++++ .../unit_test/test_unsqueeze_op_metax.py | 98 +++++ 23 files changed, 3894 insertions(+), 8 deletions(-) create mode 100644 backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh create mode 100644 backends/metax_gpu/tests/unit_test/test_abs_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_arange_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_greater_equal.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py create mode 100644 backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py create mode 100644 backends/metax_gpu/tests/unit_test/test_layer_norm.py create mode 100644 backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_swiglu_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_top_p_sampling.py create mode 100644 backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index d2e92f209ab..7e549ef4eaa 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -5,22 +5,106 @@ enable_testing() find_package(Python REQUIRED COMPONENTS Interpreter) -file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "unittest/*.py") +set(PADDLE_LEGACY_TEST_PATH + ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) +set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) + +file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") list( APPEND PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test/test_tril_triu_op.py -) + ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_reduce_op.py) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_cumsum_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_softmax_with_cross_entropy_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_expand_v2_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_tril_triu_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_squared_l2_norm_op_metax.py) + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py) list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) foreach(test_script ${PYTHON_TEST_SCRIPTS}) diff --git a/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh new file mode 100644 index 00000000000..86bfcb08f86 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +SOURCE_DIR="backends/metax_gpu/tests/unittest" +SEARCH_DIR="Paddle/test/legacy_test" +PREFIX_FILE="metax_prefixes.txt" +UNMATCHED_FILE="unmatched_files.txt" +EXIST_FILE="existing_files.txt" +MISS_FILE="missing_files.txt" + +# 检查源路径是否存在 +if [ ! -d "$SOURCE_DIR" ]; then + echo "错误: 源路径 '$SOURCE_DIR' 不存在或不是一个目录" + exit 1 +fi + +# 检查搜索路径是否存在 +if [ ! -d "$SEARCH_DIR" ]; then + echo "错误: 搜索路径 '$SEARCH_DIR' 不存在或不是一个目录" + exit 1 +fi + +# 第一步:提取前缀(根据新规则处理) +echo "第一步:从 '$SOURCE_DIR' 提取文件前缀(按_op/_metax规则)..." +> "$PREFIX_FILE" # 清空前缀文件 +> "$UNMATCHED_FILE" # 清空未匹配文件列表 + +find "$SOURCE_DIR" -type f -name "*.py" | while read -r file; do + filename=$(basename "$file") + prefix="" + + # 规则1:如果包含_op关键字,提取_op前的所有字符 + if [[ "$filename" == *"_op"* ]]; then + prefix="${filename%%_op*}" + echo "提取前缀(_op规则): $prefix (来自 $filename)" + echo "$prefix" >> "$PREFIX_FILE" + + # 规则2:如果没有_op但有_metax,提取_metax前的所有字符 + elif [[ "$filename" == *"_metax"* ]]; then + prefix="${filename%%_metax*}" + echo "提取前缀(_metax规则): $prefix (来自 $filename)" + echo "$prefix" >> "$PREFIX_FILE" + + # 规则3:都不包含,归类到未匹配 + else + echo "未匹配的文件: $filename(不包含_op和_metax)" + echo "$filename" >> "$UNMATCHED_FILE" + fi +done + +# 检查是否有提取到前缀或未匹配文件 +prefix_count=$(wc -l < "$PREFIX_FILE") +unmatched_count=$(wc -l < "$UNMATCHED_FILE") + +echo "提取完成 - 有效前缀: $prefix_count 个,未匹配文件: $unmatched_count 个" + +if [ $prefix_count -eq 0 ] && [ $unmatched_count -eq 0 ]; then + echo "警告: 在 '$SOURCE_DIR' 中未找到任何以 '_metax.py' 结尾的文件" + exit 0 +fi + +# 第二步:在搜索路径中查找同名文件(仅搜索当前目录,不包括子文件夹) +echo -e "\n第二步:在 '$SEARCH_DIR' 中搜索同名文件(深度为1)..." 
+> "$EXIST_FILE" # 清空存在文件列表 +> "$MISS_FILE" # 清空缺失文件列表 + +# 逐个处理每个前缀 +while read -r prefix; do + # 跳过空行 + if [ -z "$prefix" ]; then + continue + fi + + # 只在搜索路径的直接目录下查找(深度为1) + found=$(find "$SEARCH_DIR" -maxdepth 1 -type f -name "${prefix}_op.py" -print -quit) + + if [ -n "$found" ]; then + echo "$prefix -> 找到文件: $found" + echo "${prefix}_op.py" >> "$EXIST_FILE" + else + echo "$prefix -> 未找到同名文件" + echo "$prefix" >> "$MISS_FILE" + fi +done < "$PREFIX_FILE" + +# 输出结果统计 +exist_count=$(wc -l < "$EXIST_FILE") +miss_count=$(wc -l < "$MISS_FILE") + +echo -e "\n处理完成!" +echo "找到同名文件的前缀数量: $exist_count(已保存到 $EXIST_FILE)" +echo "未找到同名文件的前缀数量: $miss_count(已保存到 $MISS_FILE)" +echo "未匹配规则的文件数量: $unmatched_count(已保存到 $UNMATCHED_FILE)" diff --git a/backends/metax_gpu/tests/unit_test/test_abs_metax.py b/backends/metax_gpu/tests/unit_test/test_abs_metax.py new file mode 100644 index 00000000000..0dae6822bba --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_abs_metax.py @@ -0,0 +1,39 @@ +# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.base.dygraph as dg + + +class TestAbs(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32"] + self._places = [paddle.CustomPlace("metax_gpu", 0)] + + def test_all_positive(self): + for dtype in self._dtypes: + x = 1 + 10 * np.random.random([13, 3, 3]).astype(dtype) + for place in self._places: + with dg.guard(place): + y = paddle.abs(paddle.to_tensor(x)) + np.testing.assert_allclose(np.abs(x), y.numpy(), rtol=1e-05) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_arange_metax.py b/backends/metax_gpu/tests/unit_test/test_arange_metax.py new file mode 100644 index 00000000000..89308c33401 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_arange_metax.py @@ -0,0 +1,260 @@ +# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +# # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from op_test import OpTest, convert_float_to_uint16 + +import paddle +from paddle.base import core +from paddle.static import Program, program_guard + + +def arange_wrapper(start, end, step, dtype="float32"): + return paddle.arange(start, end, step, dtype) + + +class TestArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": np.array([self.case[0]]).astype(self.dtype), + "End": np.array([self.case[1]]).astype(self.dtype), + "Step": np.array([self.case[2]]).astype(self.dtype), + } + + self.outputs = { + "Out": np.arange(self.case[0], self.case[1], self.case[2]).astype( + self.dtype + ) + } + + def init_config(self): + self.dtype = np.float32 + self.python_api = arange_wrapper + self.case = (0, 1, 0.2) + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + +class TestFloatArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float32 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +class TestFloat16ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float16 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestBFloat16ArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": convert_float_to_uint16(self.start), + "End": convert_float_to_uint16(self.end), + "Step": convert_float_to_uint16(self.step), + } + + self.outputs = { + "Out": convert_float_to_uint16(np.arange(self.start, self.end, self.step)) + } + + def init_config(self): + self.dtype = np.uint16 + self.python_api = arange_wrapper + self.case = (0, 5, 1) + self.start = np.array([self.case[0]]).astype(np.float32) + self.end = np.array([self.case[1]]).astype(np.float32) + self.step = np.array([self.case[2]]).astype(np.float32) + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_pir=True, check_symbol_infer=False) + + +class TestInt32ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 5, 2) + + +class TestFloat64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float64 + self.python_api = paddle.arange + self.case = (10, 1, -2) + + +class TestInt64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int64 + self.python_api = paddle.arange + self.case = (-1, -10, -2) + + +class TestZeroSizeArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 0, 1) + + +class TestArangeOpError(unittest.TestCase): + def test_static_errors(self): + with program_guard(Program(), Program()): + paddle.enable_static() + self.assertRaises(TypeError, paddle.arange, 10, dtype="int8") + + +class TestArangeAPI(unittest.TestCase): + def test_out(self): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x1 = paddle.arange(0, 5, 1, "float32") + + place = ( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + out = exe.run(fetch_list=[x1]) + + expected_data = np.arange(0, 5, 1).astype(np.float32) + self.assertEqual((out == expected_data).all(), True) + 
self.assertListEqual(list(x1.shape), [5])
+            paddle.disable_static(place)
+
+
+class TestArangeImperative(unittest.TestCase):
+    def test_out(self):
+        place = (
+            paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
+        )
+        paddle.disable_static(place)
+        x1 = paddle.arange(0, 5, 1)
+        x2 = paddle.tensor.arange(5)
+        x3 = paddle.tensor.creation.arange(5)
+
+        start = paddle.to_tensor(np.array([0], "float32"))
+        end = paddle.to_tensor(np.array([5], "float32"))
+        step = paddle.to_tensor(np.array([1], "float32"))
+        x4 = paddle.arange(start, end, step, "int64")
+
+        expected_data = np.arange(0, 5, 1).astype(np.int64)
+        for x in [x1, x2, x3, x4]:
+            np.testing.assert_array_equal(x.numpy(), expected_data)
+
+        start_float = paddle.to_tensor(np.array([0.5], "float32"))
+        end_float = paddle.to_tensor(np.array([1.5], "float32"))
+        step_float = paddle.to_tensor(np.array([0.5], "float32"))
+        # all of [start, end, step] are float
+        x5 = paddle.arange(start_float, end_float, step_float)
+        x5_expected_data = np.arange(0.5, 1.5, 0.5).astype(np.float32)
+        np.testing.assert_array_equal(x5.numpy(), x5_expected_data)
+        self.assertEqual(x5.numpy().dtype, np.float32)
+
+        # [start, end] are float, [step] is int
+        x6 = paddle.arange(start_float, end_float, 1)
+        x6_expected_data = np.arange(0.5, 1.5, 1).astype(np.float32)
+        np.testing.assert_array_equal(x6.numpy(), x6_expected_data)
+        self.assertEqual(x6.numpy().dtype, np.float32)
+
+        # [start] is float, [end] is int
+        x7 = paddle.arange(start_float, 1)
+        x7_expected_data = np.arange(0.5, 1).astype(np.float32)
+        np.testing.assert_array_equal(x7.numpy(), x7_expected_data)
+        self.assertEqual(x7.numpy().dtype, np.float32)
+
+        # [start] is float
+        x8 = paddle.arange(start_float)
+        x8_expected_data = np.arange(0.5).astype(np.float32)
+        np.testing.assert_array_equal(x8.numpy(), x8_expected_data)
+        self.assertEqual(x8.numpy().dtype, np.float32)
+
+        # [start] is int
+        x9 = paddle.arange(1)
+        x9_expected_data = np.arange(1).astype(np.int64)
+        np.testing.assert_array_equal(x9.numpy(), x9_expected_data)
+        self.assertEqual(x9.numpy().dtype, np.int64)
+
+        # [start] is float
+        x10 = paddle.arange(1.0)
+        x10_expected_data = np.arange(1).astype(np.float32)
+        np.testing.assert_array_equal(x10.numpy(), x10_expected_data)
+        self.assertEqual(x10.numpy().dtype, np.float32)
+
+        # [start] is np.int
+        x11 = paddle.arange(np.int64(10))
+        x11_expected_data = np.arange(10).astype(np.int64)
+        np.testing.assert_array_equal(x11.numpy(), x11_expected_data)
+        self.assertEqual(x11.numpy().dtype, np.int64)
+
+        # [start] is a big integer
+        x12 = paddle.arange(
+            start=0,
+            end=-9007199254740994,
+            step=-9007199254740993,
+        )
+
+        # numpy gives the wrong result here, so we generate 'x12_expected_data' manually
+        # x12_expected_data = np.arange(start=0, stop=-9007199254740994, step=-9007199254740993, dtype=np.int64)
+        x12_expected_data = np.array([0, -9007199254740993])
+
+        np.testing.assert_array_equal(x12.numpy(), x12_expected_data)
+        self.assertEqual(x12.numpy().dtype, np.int64)
+
+        # [start > end, step > 0]
+        x14 = paddle.arange(start=10, end=0, step=1)
+
+        x14_expected_data = np.array([])
+        np.testing.assert_array_equal(x14.numpy(), x14_expected_data)
+
+        paddle.enable_static()
+
+
+class TestArangeStatic(unittest.TestCase):
+    def test_infermeta(self):
+        paddle.enable_static()
+        x = paddle.arange(0, 1 + 0.005, 0.005)
+        self.assertEqual(x.shape, [201])
+        paddle.disable_static()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git 
a/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py new file mode 100644 index 00000000000..f575d4eece0 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py @@ -0,0 +1,72 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F + + +class BF16EmbeddingTest(unittest.TestCase): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 1024 + self.hidden_size = 512 + self.seed = 10 + + def run_main(self, dtype): + ids, weight, dout = self.gen_random() + origin_dtype = weight.dtype + weight_cast = weight.astype(dtype) + out = F.embedding(ids, weight_cast) + dout = dout.astype(out.dtype) + dweight = paddle.autograd.grad(out, weight, dout) + return ( + out.astype(origin_dtype).numpy(), + dweight[0].astype(origin_dtype).numpy(), + ) + + def gen_random(self): + np.random.seed(self.seed) + weight = np.random.random([self.vocab_size, self.hidden_size]).astype("float32") + ids = np.random.randint(low=0, high=self.vocab_size, size=[self.batch_size]) + dout = np.random.random([self.batch_size, self.hidden_size]).astype("float32") + + weight = paddle.to_tensor(weight) + weight.stop_gradient = False + ids = paddle.to_tensor(ids) + dout = paddle.to_tensor(dout) + return ids, weight, dout + + def test_main(self): + + ret1 = self.run_main("float32") + ret2 = self.run_main("bfloat16") + self.assertEqual(len(ret1), len(ret2)) + for i, (r1, r2) in enumerate(zip(ret1, ret2)): + np.testing.assert_allclose(r1, r2, atol=1e-3, rtol=1e-2) + + +class BF16EmbeddingTestOddHiddenSize(BF16EmbeddingTest): + def setUp(self): + self.batch_size = 30 + self.vocab_size = 511 + self.hidden_size = 512 + self.seed = 20 + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py new file mode 100644 index 00000000000..57a5d0b1c97 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle + +np.random.seed(10) + + +class TestCountNonzeroAPI(unittest.TestCase): + # test paddle.tensor.math.count_nonzero + + def setUp(self): + self.x_shape = [2, 3, 4, 5] + self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32) + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_api_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data("X", self.x_shape) + out1 = paddle.count_nonzero(x) + out2 = paddle.tensor.count_nonzero(x) + out3 = paddle.tensor.math.count_nonzero(x) + axis = np.arange(len(self.x_shape)).tolist() + out4 = paddle.count_nonzero(x, axis) + out5 = paddle.count_nonzero(x, tuple(axis)) + exe = paddle.static.Executor(self.place) + res = exe.run(feed={"X": self.x}, fetch_list=[out1, out2, out3, out4, out5]) + out_ref = np.count_nonzero(self.x) + for out in res: + np.testing.assert_allclose(out, out_ref, rtol=1e-05) + + def test_api_dygraph(self): + paddle.disable_static(self.place) + + def test_case(x, axis=None, keepdim=False): + x_tensor = paddle.to_tensor(x) + out = paddle.count_nonzero(x_tensor, axis=axis, keepdim=keepdim) + if isinstance(axis, list): + axis = tuple(axis) + if len(axis) == 0: + axis = None + + out_ref = np.count_nonzero(x, axis, keepdims=keepdim) + np.testing.assert_allclose(out.numpy(), out_ref, rtol=1e-05) + + test_case(self.x) + test_case(self.x, None) + test_case(self.x, -1) + test_case(self.x, keepdim=True) + test_case(self.x, 2, keepdim=True) + test_case(self.x, [0, 2]) + test_case(self.x, (0, 2)) + test_case(self.x, (0, 1, 3)) + test_case(self.x, [0, 1, 2, 3]) + paddle.enable_static() + + def test_errors(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data("X", [10, 12], "int32") + self.assertRaises(ValueError, paddle.count_nonzero, x, axis=10) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py new file mode 100644 index 00000000000..73e389324f9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py @@ -0,0 +1,208 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.base import core + +np.random.seed(10) + + +def ref_gaussian_nll_loss( + input, label, variance, full=False, eps=1e-6, reduction="none" +): + if variance.shape != input.shape: + if input.shape[:-1] == variance.shape: + variance = np.expand_dims(variance, -1) + elif input.shape[:-1] == variance.shape[:-1] and variance.shape[-1] == 1: + pass + else: + raise ValueError("variance is of incorrect size") + if reduction != "none" and reduction != "mean" and reduction != "sum": + raise ValueError(reduction + " is not valid") + + if np.any(variance < 0): + raise ValueError("var has negative entry/entries") + + variance = variance.copy() + variance = np.clip(variance, a_min=eps, a_max=None) + + loss = 0.5 * (np.log(variance) + (input - label) ** 2 / variance) + if full: + loss += 0.5 * np.log(2 * np.pi) + + if reduction == "none": + return loss + elif reduction == "sum": + return [np.sum(loss)] + elif reduction == "mean": + return [np.mean(loss)] + + +class TestGaussianNLLLossAPI(unittest.TestCase): + # test paddle.nn.functional.gaussian_nll_loss, paddle.nn.gaussian_nll_loss + + def setUp(self, type=None): + self.shape = [10, 2] + if type in ["float16", "float64", "int32", "int64"]: + dtype = np.dtype(type) + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + elif type == "broadcast1": + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + elif type == "broadcast2": + self.shape = [10, 2, 3] + self.broadcast_shape = [10, 2, 1] + self.input_np = np.random.random(self.shape).astype(np.float32) + self.label_np = np.random.random(self.shape).astype(np.float32) + self.variance_np = np.ones(self.broadcast_shape).astype(np.float32) + else: + dtype = np.dtype("float32") + self.input_np = np.random.random(self.shape).astype(dtype) + self.label_np = np.random.random(self.shape).astype(dtype) + self.variance_np = np.ones(self.shape).astype(dtype) + if type == "test_err": + self.variance_np = -np.ones(self.shape).astype(np.float32) + + self.place = ( + paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace() + ) + + def test_dynamic_case(self, type=None, full=False, reduction="none"): + self.setUp(type) + paddle.disable_static(self.place) + + input_x = paddle.to_tensor(self.input_np) + label = paddle.to_tensor(self.label_np) + variance = paddle.to_tensor(self.variance_np) + if type in ["test_err", "int32", "int64"]: + self.assertRaises( + ValueError, + paddle.nn.functional.gaussian_nll_loss, + input=input_x, + label=label, + variance=variance, + ) + else: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction) + out2 = gaussian_nll_loss(input_x, label, variance) + + for r in [out1, out2]: + np.allclose(out_ref, r.numpy(), rtol=1e-5, atol=1e-5) + paddle.enable_static() + + def test_static_case(self, type=None, full=False, reduction="none"): + self.setUp(type) + paddle.enable_static() + with 
paddle.static.program_guard(paddle.static.Program()): + if type in ["int32", "int64", "float64"]: + input_x = paddle.static.data("Input_x", self.shape, type) + label = paddle.static.data("Label", self.shape, type) + variance = paddle.static.data("Variance", self.shape, type) + elif type in ["broadcast1", "broadcast2"]: + input_x = paddle.static.data("Input_x", self.shape) + label = paddle.static.data("Label", self.shape) + variance = paddle.static.data("Variance", self.broadcast_shape) + else: + input_x = paddle.static.data("Input_x", self.shape, "float32") + label = paddle.static.data("Label", self.shape, "float32") + variance = paddle.static.data("Variance", self.shape, "float32") + out1 = F.gaussian_nll_loss( + input_x, label, variance, full=full, reduction=reduction + ) + gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction) + out2 = gaussian_nll_loss(input_x, label, variance) + exe = paddle.static.Executor(self.place) + if type not in ["test_err", "int32", "int64"]: + out_ref = ref_gaussian_nll_loss( + self.input_np, + self.label_np, + self.variance_np, + full=full, + reduction=reduction, + ) + res = exe.run( + feed={ + "Input_x": self.input_np, + "Label": self.label_np, + "Variance": self.variance_np, + }, + fetch_list=[out1, out2], + ) + for r in res: + np.allclose(out_ref, r, rtol=1e-5, atol=1e-5) + else: + try: + res = exe.run( + feed={ + "Input_x": self.input_np, + "Label": self.label_np, + "Variance": self.variance_np, + }, + fetch_list=[out1, out2], + ) + except ValueError: + pass + + def test_api(self): + self.test_dynamic_case() + self.test_static_case() + + def test_float64(self): + self.test_dynamic_case("float64") + self.test_static_case("float64") + + def test_broadcast(self): + self.test_dynamic_case("broadcast1") + self.test_static_case("broadcast1") + + def test_broadcast_with_same_dim(self): + self.test_dynamic_case("broadcast2") + self.test_static_case("broadcast2") + + def test_reduction(self): + self.test_dynamic_case(full=True, reduction="mean") + self.test_dynamic_case(full=True, reduction="sum") + self.test_static_case(full=True, reduction="mean") + + def test_error(self): + self.test_dynamic_case("test_err") + self.test_static_case("test_err") + + def test_int(self): + self.test_dynamic_case("int64") + self.test_dynamic_case("int32") + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal.py new file mode 100644 index 00000000000..816d6075099 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_greater_equal.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +import numpy as np + +import paddle +from paddle import static + + +class Test_Greater_Equal_Op_Fp16(unittest.TestCase): + def test_api_fp16(self): + paddle.enable_static() + with static.program_guard(static.Program(), static.Program()): + label = paddle.to_tensor([3, 3], dtype="float16") + limit = paddle.to_tensor([3, 2], dtype="float16") + out = paddle.greater_equal(x=label, y=limit) + # if core.is_compiled_with_cuda(): + # place = paddle.CUDAPlace(0) + # exe = static.Executor(place) + # (res,) = exe.run(fetch_list=[out]) + # self.assertEqual((res == np.array([True, True])).all(), True) + place = paddle.CustomPlace(paddle.device.get_device().split(":")[0], 0) + exe = static.Executor(place) + (res,) = exe.run(fetch_list=[out]) + self.assertEqual((res == np.array([True, True])).all(), True) + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py new file mode 100644 index 00000000000..b4e4282c5ce --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py @@ -0,0 +1,62 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional import build_src_rank_and_local_expert_id + +logger = logging.getLogger(__name__) + + +class TestFusedCalculateAuxLoss(unittest.TestCase): + def test_build_src_rank_and_local_expert_id(self): + def orig_func(expert_num_global_list, num_local_experts): + send_rank_cpu = np.concatenate( # TOO SLOW!!! 
break every thing + [ + np.full([j], i // num_local_experts, dtype="int32") + for i, j in enumerate(expert_num_global_list) + ], + 0, + ) + local_expert_id_cpu = np.concatenate( + [ + np.full([j], i % num_local_experts, dtype="int32") + for i, j in enumerate(expert_num_global_list) + ], + 0, + ) + send_rank = paddle.to_tensor(send_rank_cpu) + local_expert_id = paddle.to_tensor(local_expert_id_cpu) + return send_rank, local_expert_id + + def fused_func(expert_num_global_tensor, expert_num_global, num_local_experts): + return build_src_rank_and_local_expert_id( + expert_num_global_tensor, expert_num_global, num_local_experts + ) + + expert_num_global = np.random.randint(0, 512, size=[12 * 8], dtype="int32") + expert_num_global_tensor = paddle.to_tensor(expert_num_global, dtype="int64") + + s1, l1 = orig_func(expert_num_global, 12) + s2, l2 = fused_func(expert_num_global_tensor, expert_num_global, 12) + assert ((s1 - s2) == 0).all(), (s1, s2) + assert ((l1 - l2) == 0).all(), (l1, l2) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py new file mode 100644 index 00000000000..2d5670ee739 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py @@ -0,0 +1,172 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +from collections import namedtuple +from functools import partial + +from ernie_utils.moe_all_gather_layer import MOEAllGatherLayerV2 + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import expand_modality_expert_id + + +def fused_gate_logits_process_ref(self, gate_logits_lm, gate_logits_mm, token_type_ids): + """process gatelogits""" + top_k = self.k + num_expert_per_rank_per_modality = ( + gate_logits_lm.shape[-1] // self.config.moe_world_size + ) + + @paddle.no_grad() + def shift_ids(ids, modality_offset): + # 现在认为所以模态的 expert 数都一样 + rank = ids // num_expert_per_rank_per_modality + expert_id_in_rank = ids % num_expert_per_rank_per_modality + return ( + rank * (num_expert_per_rank_per_modality * 2) + + expert_id_in_rank + + modality_offset * num_expert_per_rank_per_modality + ) + + if self.group_experts: + gate_logits_lm = gate_logits_lm.reshape([gate_logits_lm.shape[0], top_k, -1]) + prob_lm = self.gate.act(gate_logits_lm) + weight_lm, expert_id_lm = prob_lm.topk(k=1, axis=-1) + weight_lm = weight_lm.reshape([gate_logits_lm.shape[0], -1]) + expert_id_lm = expert_id_lm.reshape([gate_logits_lm.shape[0], -1]) + group_size = gate_logits_lm.shape[-1] + scale = paddle.arange(0, top_k * group_size, group_size).unsqueeze(0) + expert_id_lm = expert_id_lm + scale + else: + prob_lm = self.gate.act(gate_logits_lm) + weight_lm, expert_id_lm = prob_lm.topk(k=top_k, axis=-1) + if token_type_ids is not None: + expert_id_lm = shift_ids(expert_id_lm, 0) + expert_id_lm.stop_gradient = True + lm_weight_and_expert_id = paddle.concat( + [weight_lm, expert_id_lm.astype("float32")], -1 + ) + if token_type_ids is None: + return ( + lm_weight_and_expert_id, + prob_lm.reshape([prob_lm.shape[0], -1]), + None, + ) + + prob_mm = self.gate.act(gate_logits_mm) + weight_mm, expert_id_mm = prob_mm.topk(k=top_k, axis=-1) + + expert_id_mm = shift_ids(expert_id_mm, 1) + expert_id_mm.stop_gradient = True + + mm_weight_and_expert_id = paddle.concat( + [weight_mm, expert_id_mm.astype("float32")], -1 + ) + + token_type_ids_float = token_type_ids[:, None].astype("float32") + weight_and_expert = ( + 1 - token_type_ids_float + ) * lm_weight_and_expert_id + token_type_ids_float * mm_weight_and_expert_id + return weight_and_expert, prob_lm.reshape([prob_lm.shape[0], -1]), prob_mm + + +def test_expand_modality_expert_id(): + def expand_id_one( + expert_id, + num_expert_per_modality, + k, + group_size, + modality_offset, + is_group_expert, + ): + orig_shape = expert_id.shape + expert_id = expert_id.reshape([-1]) + xid = paddle.arange(len(expert_id)) + if is_group_expert: + eid = xid % k + expert_id += eid * group_size + + rank = expert_id // num_expert_per_modality + expert_id_in_rank = expert_id % num_expert_per_modality + ret = ( + rank * (num_expert_per_modality * 2) + + expert_id_in_rank + + modality_offset * num_expert_per_modality + ) + return ret.reshape(orig_shape) + + S, E, k = 100, 24, 3 + expert_id_mm = paddle.randint(0, 12, shape=[S, k]) + num_expert_per_rank_per_modality = E // 2 // 4 + group_size = E // 2 // k + print(f"num_expert_per_rank_per_modality: {num_expert_per_rank_per_modality}") + fused = expand_modality_expert_id( + expert_id_mm, num_expert_per_rank_per_modality, group_size, 1, True + ) + + nonfused = expand_id_one( + expert_id_mm, num_expert_per_rank_per_modality, k, group_size, 1, True + ) + # num_expert_per_rank_per_modality, group_size + assert (fused == nonfused).all().item() + + Config = namedtuple("Config", ["moe_world_size"]) + Self = 
namedtuple( + "Self", + [ + "config", + "k", + "gate", + "group_experts", + "moe_statics", + "use_correction_bias", + ], + ) + Gate = namedtuple("Gate", ["act"]) + fake_gate = Gate(act=partial(F.softmax, axis=-1)) + fake_self = Self( + config=Config( + moe_world_size=8, + ), + k=k, + gate=fake_gate, + moe_statics=None, + use_correction_bias=False, + group_experts=True, + ) + + fake_logits = paddle.randn([S, E]) + fake_logits_mm = paddle.randn([S, E]) + token_type_ids = paddle.randint(0, 2, shape=[S]) + w_and_e, prob_lm, prob_mm = MOEAllGatherLayerV2.fused_gate_logits_process_fused( + fake_self, fake_logits, fake_logits_mm, None + ) + w_and_e_ref, prob_lm_ref, prob_mm_ref = fused_gate_logits_process_ref( + fake_self, fake_logits, fake_logits_mm, None + ) + assert (prob_lm == prob_lm_ref).all().item() + assert (w_and_e == w_and_e_ref).all().item() + w, e = w_and_e_ref.chunk(2, axis=-1) + + +class Test_expand_modality_expert_id_API(unittest.TestCase): + def test_dygraph(self): + test_expand_modality_expert_id() + + +if __name__ == "__main__": + + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py new file mode 100644 index 00000000000..ca0a780e908 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py @@ -0,0 +1,95 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle +from paddle.incubate.nn.functional import fused_rms_norm_ext + + +class TestFusedRMSNorm(unittest.TestCase): + def setUp(self): + paddle.seed(2023) + np.random.seed(2023) + + def rms_norm_reference(self, x, scale, bias=None, epsilon=1e-5): + variance = paddle.mean(paddle.square(x), axis=-1, keepdim=True) + + rms = paddle.sqrt(variance + epsilon) + y = x / rms + y = y * scale.reshape([1, -1]) + if bias is not None: + y = y + bias.reshape([1, -1]) + return y, (1.0 / rms).squeeze(-1) + + def test_2d_input(self): + rows, cols = 32, 64 + x = paddle.randn([rows, cols]) + scale = paddle.randn([cols]) + y_fused, invvar_fused = fused_rms_norm_ext(x, scale) + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + + np.testing.assert_allclose(y_fused, y_ref, rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(invvar_fused, invvar_ref, rtol=1e-5, atol=1e-5) + + def test_without_bias(self): + + rows, cols = 32, 64 + x = paddle.randn([rows, cols]) + scale = paddle.randn([cols]) + + y_fused, invvar_fused = fused_rms_norm_ext(x, scale) + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + + np.testing.assert_allclose(y_fused, y_ref, rtol=1e-5, atol=1e-5) + np.testing.assert_allclose(invvar_fused, invvar_ref, rtol=1e-5, atol=1e-5) + + def test_backward(self): + + rows, cols = 16, 32 + x = paddle.randn([rows, cols], dtype="float32") + x.stop_gradient = False + scale = paddle.randn([cols], dtype="float32") + scale.stop_gradient = False + + y_fused, invvar = fused_rms_norm_ext(x, scale) + + loss = paddle.mean(y_fused) + loss.backward() + + x_grad_fused = x.grad.clone() + scale_grad_fused = scale.grad.clone() + + x.clear_gradient() + scale.clear_gradient() + + y_ref, invvar_ref = self.rms_norm_reference(x, scale) + loss_ref = paddle.mean(y_ref) + loss_ref.backward() + + x_grad_ref = x.grad + scale_grad_ref = scale.grad + + np.testing.assert_allclose(x_grad_fused, x_grad_ref, rtol=1e-4, atol=1e-4) + np.testing.assert_allclose( + scale_grad_fused, scale_grad_ref, rtol=1e-4, atol=1e-4 + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py new file mode 100644 index 00000000000..23df4e3457b --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py @@ -0,0 +1,193 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
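+
+# Editor's sketch (illustrative only) of the combine step verified below: each
+# destination token s gathers its k routed expert outputs through scatter_index
+# and sums them weighted by combine_weights, y[s] = sum_k w[s, k] * x[idx[s, k]].
+import numpy as np
+
+
+def _combine_numpy_sketch(x, combine_weights, scatter_index):
+    # x: [num_dispatched, dim]; combine_weights, scatter_index: [seq, k]
+    gathered = x[scatter_index]  # fancy indexing -> [seq, k, dim]
+    return (combine_weights[..., None] * gathered).sum(axis=1)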
+ +import os +import random +import unittest + +import numpy as np +from ernie_utils.moe_layer_uneven import GateCombine + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import moe_combine + +os.environ["FLAGS_flash_attn_version"] = "v1" +os.environ["FLAGS_cudnn_deterministic"] = "1" +os.environ["FLAGS_embedding_deterministic"] = "1" + + +def combining(x, combine_weights, scatter_index, hard_gate=False): + """ + Args: + x: Tensor[seq, dim] + combine_weights: [seq, k] + scatter_index: ** [seq, k] ** + + Returns: + y: Tensor[s, dim] + """ + x_gatherd = F.embedding(scatter_index, x) # [s,k,dim] + if hard_gate: + return x_gatherd.squeeze(-2) + # logger.info(f'combinning: {combine_weights}') + y = (combine_weights.unsqueeze(-1) * x_gatherd).sum(1) + # y = paddle.matmul(combine_weights.unsqueeze(1), x_gatherd).squeeze() # [s,1,k] @ [s,k,dim] -> [s,1,dim] + return y + + +def baseline_result(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy): + """baseline_result""" + scatter_index = paddle.to_tensor(scatter_index_numpy) + x = paddle.to_tensor(x_numpy).cast("float32") + x.stop_gradient = False + + combine_weights = paddle.to_tensor(combine_weights_numpy).cast("float32") + combine_weights.stop_gradient = False + + scatter_index = paddle.to_tensor(scatter_index_numpy) + grad = paddle.to_tensor(grad_numpy).cast("float32") + + y = combining(x, combine_weights, scatter_index) + paddle.autograd.backward([y], [grad], True) + return [x.grad, combine_weights.grad, y] + + +def test_moe_combine(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy): + """baseline_result""" + x = paddle.to_tensor(x_numpy).cast("float32") + x.stop_gradient = False + + combine_weights = paddle.to_tensor(combine_weights_numpy).cast("float32") + combine_weights.stop_gradient = False + + scatter_index = paddle.to_tensor(scatter_index_numpy).cast("int32") + grad = paddle.to_tensor(grad_numpy).cast("float32") + + y = GateCombine.apply(x, combine_weights, scatter_index) + paddle.autograd.backward([y], [grad], True) + # grad.backward() + return [x.grad, combine_weights.grad, y] + + +def gen_test_case(S, K, Dim, capacity_factor, seed=1234): + """gen_test_case""" + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + x_numpy = np.random.rand(int(S * capacity_factor), Dim).astype(np.float32) + combine_weights_numpy = np.random.rand(S, K).astype(np.float32) + scatter_index_numpy = np.random.permutation(max(x_numpy.shape[0], S * K))[ + : S * K + ].astype("int64") + scatter_index_numpy = scatter_index_numpy.reshape([S, K]) + + combine_weights_numpy[scatter_index_numpy >= x_numpy.shape[0]] = 0 + scatter_index_numpy[scatter_index_numpy >= x_numpy.shape[0]] = 0 + grad_numpy = np.random.randn(S, Dim).astype(np.float32) + return x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy + + +def testing(test_case): + """testing""" + [bl_x_grad, bl_combine_weights_grad, bl_y] = baseline_result(*test_case) + [fused_x_grad, fused_combine_weights_grad, fused_y] = test_moe_combine(*test_case) + np.testing.assert_allclose( + fused_y.astype("float32").numpy(), + bl_y.astype("float32").numpy(), + err_msg="fwd precision not pass", + rtol=1e-6, + ) + np.testing.assert_allclose( + fused_x_grad.astype("float32").numpy(), + bl_x_grad.astype("float32").numpy(), + rtol=1e-6, + err_msg="bwd grad precision not pass", + ) + np.testing.assert_allclose( + fused_combine_weights_grad.astype("float32").numpy(), + bl_combine_weights_grad.astype("float32").numpy(), + rtol=1e-6, + ) + + +class 
TestFused(unittest.TestCase):
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_lt_2(
+        self,
+    ):
+        """
+        Check that the fused moe_combine matches the reference result.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises an exception on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=1.8))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_eq_2(
+        self,
+    ):
+        """
+        Check that the fused moe_combine matches the reference result.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises an exception on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_cap_gt_2(
+        self,
+    ):
+        """
+        Check that the fused moe_combine matches the reference result.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises an exception on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2.2))
+
+    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
+    def test_k_gt_2(
+        self,
+    ):
+        """
+        Check that the fused moe_combine matches the reference result.
+
+        Args:
+            None.
+
+        Returns:
+            NoneType: returns None when the test passes; raises an exception on failure.
+
+        """
+        testing(gen_test_case(S=1024, K=8, Dim=4096, capacity_factor=2))
+
+
+if __name__ == "__main__":
+
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
new file mode 100644
index 00000000000..4c209970629
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
@@ -0,0 +1,218 @@
+# ruff: noqa: C419
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
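+
+# Editor's sketch (illustrative, not the fused kernel): the dispatch ops below
+# route each token to its top-k experts and enforce a per-expert capacity, so
+# assignments that overflow an expert's capacity are dropped (weight zeroed).
+import numpy as np
+
+
+def _topk_capacity_sketch(prob, k, capacity):
+    # prob: [num_tokens, num_experts]; returns per-token expert ids and weights.
+    expert_id = np.argsort(-prob, axis=-1)[:, :k]
+    weight = np.take_along_axis(prob, expert_id, axis=-1)
+    fill = np.zeros(prob.shape[1], dtype=np.int64)
+    for s in range(prob.shape[0]):
+        for j in range(k):
+            e = expert_id[s, j]
+            if fill[e] >= capacity:
+                weight[s, j] = 0.0  # token dropped for this expert
+            else:
+                fill[e] += 1
+    return expert_id, weight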
+ +import unittest + +import paddle +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_partial_nosoftmaxtopk, +) + + +def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(): + + s, d, e = 4, 100, 8 + k, cap = 4, 3 + local_expert_num = 2 + + # x = paddle.randn([s, d]) + # gate_logits = paddle.randn([s, e]) + x = paddle.arange(1, s + 1).unsqueeze(-1).expand([s, d]).astype("bfloat16") + x_ = x.clone().detach() + + t = ( + (paddle.arange(0, e)).unsqueeze(0) + paddle.arange(0, -s, -1).unsqueeze(-1) + ) % e + gate_logits = (1 / (t + 1)).astype("float32") + # gate_logits = F.softmax(paddle.randn([s,e]),-1).astype('float32') + gate_logits_ = gate_logits.clone().detach() + s = x.shape[0] + d = x.shape[1] + e = gate_logits.shape[1] + x.stop_gradient = False + x_.stop_gradient = False + gate_logits.stop_gradient = False + gate_logits_.stop_gradient = False + print(f"gate_logits:{gate_logits}") + + def check_ascend(index_rev, chunks): + for idx in index_rev.split(chunks.tolist()): + if len(idx) > 2: + assert (paddle.diff(idx) >= 0).all(), (index_rev,) + + ys, comm, scatter_idx = [], [], [] + for ilocal_expert in range(0, e, local_expert_num): + combine_weihgts, expert_id = gate_logits.topk(k=k, axis=1) + ( + y, + combine_weihgts, + scatter_index, + scatter_index_rev, + expert_offset, + expert_num_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, + combine_weihgts, + expert_id.astype("int32"), + k=k, + capacity=cap, + num_experts=gate_logits.shape[-1], + use_pad=False, + expert_start_index=ilocal_expert, + expert_end_index=ilocal_expert + local_expert_num, # k # cap + reverse_token_drop=False, + ) + check_ascend(scatter_index_rev, expert_num_local) + print(f"y:{y.mean(-1)}") + print(f"combine_weihgts:{combine_weihgts}") + print(f"expert_num_local:{expert_num_local}") + print(f"scatter_index:{scatter_index.transpose([1,0])}") + print(f"scatter_index_rev:{scatter_index_rev}") + + ys.append(y) + comm.append(combine_weihgts) + scatter_idx.append(scatter_index) + + comm_sum = paddle.stack(comm).sum(0) + ys_sum = paddle.concat(ys) + + ( + y_, + combine_weihgts_, + scatter_index_, + expert_offset_, + expert_id_, + ) = moe_gate_dispatch( + x_, + gate_logits_, + None, + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + valid_y = y_.sum(-1) > 0.0 + y_2 = y_[valid_y].squeeze() + + print( + f""" + y: {ys_sum.astype("float32").mean(axis=-1)} + y_: {y_2.astype("float32").mean(axis=-1)} + + comm-weight: {comm_sum} + comm-weight_: {combine_weihgts_} + + expert_id:{expert_id} + scatter_index:{scatter_index} + scatter_index_rev: {scatter_index_rev} + expert_num_global:{expert_offset} + expert_num_local:{expert_num_local} + """ + ) + + print("<<< begin backward>>>") + + assert combine_weihgts_.shape == combine_weihgts.shape, ( + combine_weihgts_.shape, + combine_weihgts.shape, + ) + + dysum, dcombine_weights_sum = paddle.ones_like(ys_sum), paddle.randn( + comm_sum.shape + ).astype(comm_sum.dtype) + dy_, dcombine_weights_ = paddle.ones_like(y_), paddle.ones_like(combine_weihgts_) + dy_[~valid_y] = 0 + + y_shapes = [len(y) for y in ys] + for dyy, yy, commm in zip( + paddle.split(dysum, y_shapes), + ys, + comm, + ): + print(f"dyy:{dyy.shape}, {yy.shape} {commm.shape}") + paddle.autograd.backward([yy, commm], [dyy, dcombine_weights_sum]) + print(x.grad.astype("float32").mean(axis=-1)) + print(f"bwd original:{y_.shape} {dy_.shape}") + paddle.autograd.backward([y_, combine_weihgts_], [dy_, dcombine_weights_]) + + print(x_.grad.astype("float32").mean(axis=-1)) + + print( + f""" + 
x: {x.grad.astype('float32').mean(axis=-1)} + x_: {x_.grad.astype('float32').mean(axis=-1)} + """ + ) + + +def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(): + + S, E, D = 3, 4, 3 + k = 2 + capacity = 2 + x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16") + cw = paddle.randn([S, k]) + eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32") # 1 # 2 # 3 + ( + y, + cw_, + idx, + idx_rev, + num_ex_global, + num_ex_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, cw, eid, k, capacity, E, False, 0, 2, reverse_token_drop=True + ) + + y0, y1 = y.split([i for i in num_ex_local.tolist() if i > 0]) + assert y0[:, 0].astype("int32").tolist() == [2, 3], y0[:, 0] + assert y1[:, 0].astype("int32").tolist() == [1, 2] + + +def test_moe_ops_partial_nosoftmax_topk_empty_output(): + + S, E, D = 3, 4, 3 + k = 2 + capacity = 2 + x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16") + cw = paddle.randn([S, k]) + paddle.device.synchronize() + eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32") # 1 # 2 # 3 + ( + y, + cw_, + idx, + idx_rev, + num_ex_global, + num_ex_local, + ) = moe_gate_dispatch_partial_nosoftmaxtopk( + x, cw, eid, k, capacity, E, False, 3, 4, reverse_token_drop=True + ) + assert all([i == 0 for i in num_ex_local.tolist()]), num_ex_local + + +class TestAddition(unittest.TestCase): + def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(self): + test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op() + + def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(self): + test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop() + + def test_moe_ops_partial_nosoftmax_topk_empty_output(self): + test_moe_ops_partial_nosoftmax_topk_empty_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py new file mode 100644 index 00000000000..19752abd904 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py @@ -0,0 +1,207 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
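+
+# Editor's sketch (illustrative) of the layout difference exercised below:
+# moe_gate_dispatch produces expert-major chunks, while the permuted variant is
+# compared against a stage-major regrouping equivalent to get_stage_input_list,
+# i.e. a [world_size, stages, chunk, dim] view transposed to put stages first.
+import numpy as np
+
+
+def _stage_major_sketch(x, world_size, stages):
+    # x: [world_size * stages * chunk, dim] in expert-major order (hypothetical shapes).
+    chunk = x.shape[0] // (world_size * stages)
+    return x.reshape(world_size, stages, chunk, x.shape[-1]).transpose(1, 0, 2, 3)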
+ +import os +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_permute, +) + +os.environ["FLAGS_flash_attn_version"] = "v1" +os.environ["FLAGS_cudnn_deterministic"] = "1" +os.environ["FLAGS_embedding_deterministic"] = "1" + + +class TestFused(unittest.TestCase): + def test_moe_ops(self): + """ + test `moe-ops` w/ bias + """ + S, E, D = 8192, 64, 128 + k = 4 + x = paddle.randn([S, D], dtype="bfloat16") + gate_logits = paddle.randn([S, E], dtype="float32") + x_ = x.clone() + gate_logits_ = gate_logits.clone() + x.stop_gradient = True + x_.stop_gradient = True + gate_logits.stop_gradient = True + gate_logits_.stop_gradient = True + bias = paddle.zeros([E], dtype="float32") + cap = 512 + + ( + y, + combine_weihgts, + scatter_index, + expert_offset_, + expert_id_, + ) = moe_gate_dispatch( + x, + gate_logits, + None, + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + + ( + y_, + combine_weihgts_, + scatter_index_, + expert_offset_, + expert_id_, + ) = moe_gate_dispatch( + x_, + gate_logits_, + bias + 1, # +1也不会破坏路由结果 + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + bias_unbalanced = bias.clone() + bias_unbalanced[0] += 1 + ( + y__, + combine_weihgts__, + scatter_index__, + expert_offset__, + expert_id__, + ) = moe_gate_dispatch( + x_, + gate_logits_, + bias_unbalanced, + k=k, + capacity=cap, + use_pad=True, # k # cap + ) + np.testing.assert_equal( + y.astype("float32").numpy(), + y_.astype("float32").numpy(), + err_msg="incubate w bias not match", + ) + # bias 不影响 prob 概率 + np.testing.assert_equal( + combine_weihgts.astype("float32").numpy(), + combine_weihgts_.astype("float32").numpy(), + err_msg="incubate w bias not match", + ) + np.testing.assert_( + (y.astype("float32").numpy(0) != y__.astype("float32").numpy()).any(), + ) + + +class TestDispatchPermute(unittest.TestCase): + def get_detached_input(self, input, prob): + ret_input = input.detach() + ret_prob = prob.detach() + ret_input.stop_gradient = input.stop_gradient + ret_prob.stop_gradient = prob.stop_gradient + return ret_input, ret_prob + + def get_stage_input_list(self, x, world_size, stage): + print(world_size, stage, x.shape) + x = x.reshape([world_size * stage, -1, x.shape[-1]]) + stage_input_list = [] + x_list = paddle.split(x, num_or_sections=(world_size * stage), axis=0) + for stage_id in range(stage): + stage_input_list.append( + paddle.unsqueeze(paddle.concat(x_list[stage_id::stage], axis=0), axis=0) + ) + stage_input_list = paddle.concat(stage_input_list, axis=0) + return stage_input_list + + def test_moe_permute_ops(self): + paddle.seed(2025) + + test_cases = [ + (8, 4, 2), + (64, 16, 32), + (1024, 1024, 1024), + (8, 2, 4), + (4096, 4096, 4096), + ] + cases = list(zip(*test_cases)) + for _, case in enumerate(cases): + world_size, num_experts, num_tokens, k, hidden_size = case + capacity = num_tokens // k + stages = num_experts // world_size + + input = paddle.randn([num_tokens, hidden_size], dtype="float32") + prob_logits = paddle.randn([num_tokens, num_experts], dtype="float32") + prob = F.softmax(prob_logits, axis=-1) + input.stop_gradient = False + prob.stop_gradient = False + + compat_args = (None,) + + ref_input, ref_prob = self.get_detached_input(input, prob) + ( + ref_dispatched_input, + ref_combine_weights_unnorm, + ref_scatter_index, + ref_dispatch_mask, + _, + ) = moe_gate_dispatch( + ref_input, + ref_prob, + *compat_args, + k=k, + capacity=capacity, + use_pad=True, + ) + + 
ref_stage_input_list = self.get_stage_input_list( + ref_dispatched_input, world_size, stages + ) + + test_input, test_prob = self.get_detached_input(input, prob) + ( + test_dispatched_input, + test_combine_weights_unnorm, + test_scatter_index, + test_dispatch_mask, + _, + ) = moe_gate_dispatch_permute( + test_input, + test_prob, + *compat_args, + k=k, + capacity=capacity, + world_size=world_size, + ) + + np.testing.assert_equal( + test_dispatched_input.shape, + ref_stage_input_list.shape, + err_msg="moe_permute_ops not match", + ) + np.testing.assert_equal( + test_dispatched_input._md5sum(), + ref_stage_input_list._md5sum(), + err_msg="moe_permute_ops not match", + ) + + +if __name__ == "__main__": + + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py new file mode 100644 index 00000000000..14991becc47 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py @@ -0,0 +1,175 @@ +# !/usr/bin/env python3 + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.nn.functional as F +from paddle.incubate.nn.functional import ( + moe_gate_dispatch, + moe_gate_dispatch_permute, +) + +batch_size = 4 +hidden_size = 2 +k = 16 +capacity = 2 +num_experts = 16 + +world_size = 2 + + +class TestLayer(paddle.nn.Layer): + def forward(self, x, gate_prob, k, capacity): + y, combine_weights, scatter_index, expert_offset, expert_id = moe_gate_dispatch( + x, gate_prob, None, k, capacity, True + ) + return y, combine_weights, scatter_index, expert_offset, expert_id + + +class TestLayerPermute(paddle.nn.Layer): + def forward(self, x, gate_prob, k, capacity): + ( + y, + combine_weights, + scatter_index, + expert_offset, + expert_id, + ) = moe_gate_dispatch_permute( + x, gate_prob, None, k, capacity, world_size=world_size + ) + return y, combine_weights, scatter_index, expert_offset, expert_id + + +def check_backward_correctness(layer_cls): + paddle.seed(1024) + + dtype = "bfloat16" + layer = layer_cls() + input = paddle.randn([batch_size, hidden_size]) + + gate_weight = paddle.randn([hidden_size, num_experts]) + logits = paddle.matmul(input, gate_weight) + gate_prob = F.softmax(logits, axis=-1) + print(f"gate_prob: {gate_prob}") + + input = paddle.cast(input, "bfloat16") + input.stop_gradient = False + gate_prob.stop_gradient = False + + output, combine_weights, scatter_index, expert_offset, expert_id = layer( + input, gate_prob, k, capacity + ) + + print(f"output: {output}") + print(f"combine_weights: {combine_weights}") + print(f"scatter_index: {scatter_index}") + print(f"expert_offset: {expert_offset}") + print(f"expert_id: {expert_id}") + + # output_g = paddle.randn(output.shape).astype(output.dtype) + # combine_weights_g = paddle.randn(combine_weights.shape).astype(combine_weights.dtype) + 
output_g = paddle.ones_like(output) + combine_weights_g = paddle.ones_like(combine_weights) + print(f"output_g: {output_g}") + print(f"combine_weights_g: {combine_weights_g}") + + paddle.autograd.backward( + tensors=[output, combine_weights], + grad_tensors=[output_g, combine_weights_g], + ) + # 数值估算 + epsilon = 0.005 + input_numpy = input.detach().astype("float32").numpy() + num_grad = paddle.zeros_like(input) + flattened = num_grad.reshape([-1]) + + for i in range(input.numel()): + input_pos = input_numpy.copy() + input_neg = input_numpy.copy() + input_pos.flat[i] += epsilon + input_neg.flat[i] -= epsilon + + output_pos, _, _, _, _ = layer( + paddle.to_tensor(input_pos), gate_prob, k, capacity + ) + output_neg, _, _, _, _ = layer( + paddle.to_tensor(input_neg), gate_prob, k, capacity + ) + + """ + flattened[i] = (output_pos.astype("float32").numpy() - output_neg.astype("float32").numpy()).sum() / ( + 2 * epsilon + ) + """ + grad_value = (output_pos - output_neg).sum() / (2 * epsilon) + flattened[i] = grad_value + + flattened = flattened.reshape(input.shape) + + print(f"input gradient: {input.grad}") + print(f"numerical gradient: {flattened}") + np.testing.assert_allclose( + input.grad.astype("float32").numpy(), + flattened.astype("float32").numpy(), + rtol=1e-5, + atol=0, + ) + + # 数值估算 gate_prob + epsilon = 0.0005 + gate_prob_numpy = gate_prob.detach().astype("float32").numpy() + num_grad = paddle.zeros_like(gate_prob) + flattened = num_grad.reshape([-1]) + + for i in range(gate_prob.numel()): + input_pos = gate_prob_numpy.copy() + input_neg = gate_prob_numpy.copy() + input_pos.flat[i] += epsilon + input_neg.flat[i] -= epsilon + + _, output_pos, _, _, _ = layer(input, paddle.to_tensor(input_pos), k, capacity) + _, output_neg, _, _, _ = layer(input, paddle.to_tensor(input_neg), k, capacity) + + grad_value = paddle.to_tensor( + (output_pos.numpy() - output_neg.numpy()).sum() / (2 * epsilon) + ) + flattened[i] = grad_value + + flattened = flattened.reshape(gate_prob.shape) + + print(f"gate_prob gradient: {gate_prob.grad}") + print(f"numerical gradient: {flattened}") + np.testing.assert_allclose( + gate_prob.grad.astype("float32").numpy(), + flattened.astype("float32").numpy(), + rtol=1e-4, + atol=0, + ) + + +class TestFused(unittest.TestCase): + def test_moe_backward(self): + check_backward_correctness(TestLayer) + + def test_moe_permute_backward(self): + check_backward_correctness(TestLayerPermute) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm.py new file mode 100644 index 00000000000..dbeaee31f6c --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_layer_norm.py @@ -0,0 +1,358 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
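+
+# Editor's note (illustrative): layer_norm with begin_norm_axis=a treats the
+# input as an [N, D] matrix with N = prod(shape[:a]) and D = prod(shape[a:]),
+# computing mean and variance per row of that view; for example shape
+# [2, 3, 4, 5] with begin_norm_axis=2 gives N = 6 rows of D = 20 elements.
+# The naive reference implementation below follows exactly this flattening.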
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from operator import mul +import paddle.base.core as core +import paddle.nn.functional as F +import paddle.base as base +from functools import reduce +from op_test import _set_use_system_allocator +from paddle.static.amp.fp16_utils import ( + _keep_layer_norm_scale_bias_to_fp32, +) +from paddle.pir_utils import OldIrGuard + +paddle.enable_static() + +np.random.random(123) + +_set_use_system_allocator(True) + + +def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + x.shape = [N, D] + + mean = np.mean(x, axis=1) + var = np.var(x, axis=1) + epsilon + output = np.divide((x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1])) + if scale is not None: + output = scale.reshape([1, D]) * output + if beta is not None: + output = output + beta.reshape([1, D]) + + x.shape, output.shape = x_shape, x_shape + return output, mean, var + + +def _reference_layer_norm_grad(x, grad_y, scale, bias, mean, var, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + + if scale is not None: + scale_shape = scale.shape + scale.shape = [1, D] + x.shape, grad_y.shape = [N, D], [N, D] + var.shape, mean.shape = [N, 1], [N, 1] + + # d_bias + if bias is not None: + d_bias = np.sum(grad_y, axis=0).reshape([1, D]) + else: + d_bias = None + # d_scale + if scale is not None: + d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y, axis=0).reshape( + [1, D] + ) + else: + d_scale = None + # dx + if scale is not None: + dx_end = scale * np.sqrt(1.0 / var) * grad_y + d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape( + [N, 1] + ) # the second part equals to zero. + d_mean = 1.0 / D * d_mean_0 + d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape( + [N, 1] + ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + else: + dx_end = 1.0 * np.sqrt(1.0 / var) * grad_y + d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0, axis=1).reshape( + [N, 1] + ) # the second part equals to zero. 
+ d_mean = 1.0 / D * d_mean_0 + d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * 1.0, axis=1).reshape( + [N, 1] + ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + + grad_x = dx_end + d_mean + d_std + + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape + var.shape, mean.shape = [N], [N] + + if scale is not None: + scale.shape = scale_shape + return grad_x, d_scale, d_bias + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + self.init_dtype() + self.place = paddle.CustomPlace("metax_gpu", 0) + self.__class__.use_custom_device = True + + def init_dtype(self): + self.dtype = np.float32 + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + np.testing.assert_allclose( + np.array(tensor), np_array, rtol=1e-4, atol=atol, err_msg=msg + ) + + def check_forward_backward( + self, + shape, + begin_norm_axis, + has_scale=True, + has_bias=True, + y_grad_scale=1.0, + use_mkldnn=False, + ): + def test_with_place(place, shape, begin_norm_axis, use_mkldnn=use_mkldnn): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(self.dtype) + scale = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_scale + else None + ) + bias = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_bias + else None + ) + y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype( + self.dtype + ) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, begin_norm_axis + ) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, bias, mean, variance, begin_norm_axis + ) + mean.shape = x_shape[0:begin_norm_axis] + variance.shape = x_shape[0:begin_norm_axis] + + var_dict = locals() + var_dict["y@GRAD"] = y_grad + var_names = ["x", "mean", "variance", "y", "y@GRAD"] + if has_scale: + var_names += ["scale"] + if has_bias: + var_names += ["bias"] + ground_truth = {name: var_dict[name] for name in var_names} + + with OldIrGuard(): + program = base.Program() + old_program_guard = base.program_guard + with old_program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, dtype=self.dtype, shape=ground_truth[name].shape + ) + inputs = {"X": block.var("x")} + fetch_list = [ + "y", + "mean", + "variance", + "x@GRAD", + ] + if has_scale: + inputs["Scale"] = block.var("scale") + fetch_list += ["scale@GRAD"] + if has_bias: + inputs["Bias"] = block.var("bias") + fetch_list += ["bias@GRAD"] + layer_norm_op = block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var("y"), + "Mean": block.var("mean"), # share the same memory + "Variance": block.var("variance"), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn, + }, + ) + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), [] + ) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) 
+ + program._sync_with_cpp() + exe = base.Executor(place) + with OldIrGuard(): + out = exe.run( + program, + feed={ + name: var_dict[name] + for name in ["x", "scale", "bias", "y@GRAD"] + }, + fetch_list=fetch_list, + ) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(variance, out[2], "variance", 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + if has_scale: + self.__assert_close( + scale_grad.reshape(-1), + out[fetch_list.index("scale@GRAD")], + "scale_grad", + 1e-3, + ) + if has_bias: + self.__assert_close( + bias_grad.reshape(-1), + out[fetch_list.index("bias@GRAD")], + "bias_grad", + ) + + test_with_place(self.place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=True + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=True, has_bias=False + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=False + ) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + self.check_forward_backward( + shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2) + self.check_forward_backward( + shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=True, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=True, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, has_bias=True + ) + + +class TestFP16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + weight_np = weight_np.astype(dtype) + bias_np = bias_np.astype(dtype) + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + y_np = y.numpy().astype("float32") + x_g_np = x_g.numpy().astype("float32") + w_g_np = w_g.numpy().astype("float16") + b_g_np = b_g.numpy().astype("float32") + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + paddle.set_device("metax_gpu") + x_np = np.random.random([10, 20]).astype("float16") + weight_np = np.random.random([20]).astype("float16") + bias_np = np.random.random([20]).astype("float16") + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, "float16" + ) + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, "float32" + ) + + def assert_equal(x, y): + np.testing.assert_allclose(x, y) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + +class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): + def test_main(self): + 
self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(False) + self.assertFalse(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(True) + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py new file mode 100644 index 00000000000..7545e16d14d --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py @@ -0,0 +1,395 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +from tests.op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. + if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size,)) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size,)) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestBmmOp(OpTest): + """ + case 0 + """ + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (10, 2, 5) + self.y_shape = (10, 5, 8) + + def init_kernel_type(self): + self.dtype = "float32" + + def setUp(self): + self.set_metax_gpu() + self.init_kernel_type() + self.config() + self.op_type = "bmm" + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y + result = reference_matmul(x, y) + result = result.astype(self.dtype) + self.inputs = { + "X": x, + "Y": y, + } + self.outputs = {"Out": result} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp1(TestBmmOp): + """ + case 1 + """ + + def config(self): + self.x_shape = (40, 10, 10) + self.y_shape = (40, 10, 10) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp2(TestBmmOp): + """ + case 2 + """ + + def config(self): + self.x_shape = (4, 10, 80) + self.y_shape = (4, 80, 1) + + def 
test_check_grad(self): + self.check_grad_with_place( + self.place, + ["X", "Y"], + "Out", + max_relative_error=1e-2, + ) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_metax_gpu() + self.op_type = "matmul_v2" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {"X": X, "Y": Y} + self.attrs = { + "trans_x": self.transpose_X, + "trans_y": self.transpose_Y, + "alpha": self.alpha, + } + self.outputs = {"Out": Out} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (100,) + self.y_shape = (100,) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100,) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100,) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 4, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class 
TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = 100 + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = 100 + self.transpose_X = False + self.transpose_Y = False + + +# TODO(metax_gpu): alpha will be supported in next version +# --------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +# --------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error + ) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py new file mode 100644 index 00000000000..c9bccd2abb3 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py @@ -0,0 +1,220 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import OpTest, convert_float_to_uint16 + +import paddle +from paddle import base +from paddle.base import Program, program_guard + + +def call_nonzero(x): + input = paddle.to_tensor(x) + return paddle.nonzero(x=input) + + +class TestNonZeroAPI(unittest.TestCase): + def test_nonzero_api_as_tuple(self): + paddle.enable_static() + data = np.array([[1, 0], [0, 1]], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 2) + z = paddle.concat(list(y), axis=0) + exe = base.Executor(base.CPUPlace()) + + (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False) + expect_out = np.array([0, 1, 0, 1]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.array([1, 1, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 1) + z = paddle.concat(list(y), axis=0) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False) + expect_out = np.array([0, 1]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.zeros([10, 3, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[10, 3, 0], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 3) + expect_out = np.zeros([0]) + for item in y: + np.testing.assert_array_equal(expect_out, item) + + def test_nonzero_api(self): + paddle.enable_static() + data = np.array([[1, 0], [0, 1]], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False) + expect_out = np.array([[0, 0], [1, 1]]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.array([1, 1, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False) + expect_out = np.array([[0], [1]]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + def test_dygraph_api(self): + data_x = np.array([[True, False], [False, True]]) + with base.dygraph.guard(): + x = paddle.to_tensor(data_x) + z = paddle.nonzero(x) + np_z = z.numpy() + expect_out = np.array([[0, 0], [1, 1]]) + + +# Base case +class TestNonzeroOp(OpTest): + def setUp(self): + """Test where_index op with random value""" + np.random.seed(2023) + self.op_type = "where_index" + self.python_api = 
call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [8, 8] + + def init_dtype(self): + self.dtype = np.float64 + + def create_inputs(self): + return {"Condition": np.random.randint(5, size=self.shape).astype(self.dtype)} + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestNonzeroComplex64Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex64 + + +class TestNonzeroComplex128Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex128 + + +class TestNonzeroFP32Op(TestNonzeroOp): + def init_shape(self): + self.shape = [2, 10, 2] + + def init_dtype(self): + self.dtype = np.float32 + + +class TestNonzeroFP16Op(TestNonzeroOp): + def init_shape(self): + self.shape = [3, 4, 7] + + def init_dtype(self): + self.dtype = np.float16 + + +class TestNonzeroBF16(OpTest): + def setUp(self): + """Test where_index op with bfloat16 dtype""" + np.random.seed(2023) + self.op_type = "where_index" + self.python_api = call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [12, 9] + + def init_dtype(self): + self.dtype = np.uint16 + + def create_inputs(self): + return { + "Condition": convert_float_to_uint16( + np.random.randint(5, size=self.shape).astype(np.float32) + ) + } + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestZeroSizeOp(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + +class TestZeroSizeOpCase2(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py new file mode 100644 index 00000000000..c1bc46517b6 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py @@ -0,0 +1,215 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
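+
+# Editor's sketch (illustrative only) of the norm being checked: for a finite
+# order p the reduction over `axis` is (sum(|x| ** p)) ** (1 / p); p = +/-inf
+# take the max/min of |x| and p = 0 counts non-zero entries, as in p_norm below.
+import numpy as np
+
+
+def _p_norm_sketch(x, axis, porder):
+    if porder == np.inf:
+        return np.abs(x).max(axis=axis)
+    if porder == -np.inf:
+        return np.abs(x).min(axis=axis)
+    if porder == 0:
+        return (x != 0).sum(axis=axis).astype(x.dtype)
+    return np.power(np.power(np.abs(x), porder).sum(axis=axis), 1.0 / porder)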
+ +import unittest +import numpy as np + +import paddle +from tests.op_test import OpTest + +paddle.enable_static() + + +def p_norm(x, axis, porder, keepdims=False, reduce_all=False): + r = [] + if axis is None or reduce_all: + x = x.flatten() + if porder == np.inf: + r = np.amax(np.abs(x), keepdims=keepdims) + elif porder == -np.inf: + r = np.amin(np.abs(x), keepdims=keepdims) + else: + r = np.linalg.norm(x, ord=porder, keepdims=keepdims) + elif isinstance(axis, list or tuple) and len(axis) == 2: + if porder == np.inf: + axis = tuple(axis) + r = np.amax(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == -np.inf: + axis = tuple(axis) + r = np.amin(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == 0: + axis = tuple(axis) + r = x.astype(bool) + r = np.sum(r, axis, keepdims=keepdims) + elif porder == 1: + axis = tuple(axis) + r = np.sum(np.abs(x), axis, keepdims=keepdims) + else: + axis = tuple(axis) + xp = np.power(np.abs(x), porder) + s = np.sum(xp, axis=axis, keepdims=keepdims) + r = np.power(s, 1.0 / porder) + else: + if isinstance(axis, list): + axis = tuple(axis) + r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims) + r = r.astype(x.dtype) + + return r + + +class TestPnormOp(OpTest): + def set_metax_gpu(self): + self.__class__.use_custom_device = True + + def setUp(self): + self.set_metax_gpu() + self.op_type = "p_norm" + self.init_test_case() + x = (np.random.random(self.shape) + 0.5).astype(self.dtype) + norm = p_norm(x, self.axis, self.porder, self.keepdim) + self.inputs = {"X": x} + self.attrs = { + "epsilon": self.epsilon, + "axis": self.axis, + "keepdim": self.keepdim, + "porder": float(self.porder), + } + self.outputs = {"Out": norm} + self.gradient = self.calc_gradient() + + def test_check_output(self): + if self.dtype == "float16": + self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0), atol=5e-3) + else: + self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0)) + + def test_check_grad(self): + self.check_grad_with_place( + paddle.CustomPlace("metax_gpu", 0), + ["X"], + "Out", + user_defined_grads=self.gradient, + ) + + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.init_dtype() + + def init_dtype(self): + self.dtype = "float32" + + def calc_gradient(self): + self.attrs = { + "epsilon": self.epsilon, + "axis": self.axis, + "keepdim": self.keepdim, + "porder": float(self.porder), + } + x = self.inputs["X"] + porder = self.attrs["porder"] + axis = self.attrs["axis"] + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + grad = ( + np.power(norm, 1 - porder) + * np.power(np.abs(x), porder - 1) + * np.sign(x) + ) + + numel = 1 + for s in x.shape: + numel *= s + numel /= x.shape[axis] + return [grad.astype(x.dtype) * 1 / numel] + + +class TestPnormOp2(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = True + self.init_dtype() + + +# class TestPnormOp3(TestPnormOp): +# def init_test_case(self): +# self.shape = [3, 20, 3] +# self.axis = 2 +# self.epsilon = 1e-12 +# self.porder = np.inf +# self.keepdim = True +# self.init_dtype() + + +# class TestPnormOp4(TestPnormOp3): +# def init_test_case(self): +# 
self.shape = [3, 20, 3] +# self.axis = 2 +# self.epsilon = 1e-12 +# self.porder = -np.inf +# self.keepdim = True +# self.init_dtype() + + +class TestPnormOp5(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 0 + self.keepdim = True + self.init_dtype() + + +# class TestPnormOp6(TestPnormOp): +# def init_test_case(self): +# self.shape = [2, 3, 4, 5] +# self.axis = 1 +# self.epsilon = 1e-12 +# self.porder = 0.5 +# self.keepdim = False +# self.init_dtype() + + +class TestPnormOpfp16(TestPnormOp): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp2fp16(TestPnormOp2): + def init_dtype(self): + self.dtype = "float16" + + +# class TestPnormOp3fp16(TestPnormOp3): +# def init_dtype(self): +# self.dtype = "float16" + + +# class TestPnormOp4fp16(TestPnormOp4): +# def init_dtype(self): +# self.dtype = "float16" + + +class TestPnormOp5fp16(TestPnormOp5): + def init_dtype(self): + self.dtype = "float16" + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py new file mode 100644 index 00000000000..c67e807397c --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py @@ -0,0 +1,125 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +# import sys + +# sys.path.append("..") + +import numpy as np + +import paddle +from tests.op_test import OpTest + +paddle.enable_static() + + +# Correct: General. 
+class TestSqueezeOp(OpTest):
+    def setUp(self):
+        self.op_type = "squeeze2"
+        self.init_test_case()
+        self.set_metax_gpu()
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")}
+        self.init_attrs()
+        self.outputs = {
+            "Out": self.inputs["X"].reshape(self.new_shape),
+        }
+
+    def set_metax_gpu(self):
+        self.__class__.use_custom_device = True
+        self.place = paddle.CustomPlace("metax_gpu", 0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ["X"], "Out")
+
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, 2)
+        self.new_shape = (3, 40)
+
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes}
+
+
+# class TestSqueezeBF16Op(OpTest):
+#     def setUp(self):
+#         self.op_type = "squeeze2"
+#         self.dtype = np.uint16
+#         self.init_test_case()
+#         self.set_metax_gpu()
+#         x = np.random.random(self.ori_shape).astype("float32")
+#         out = x.reshape(self.new_shape)
+#         self.inputs = {"X": convert_float_to_uint16(x)}
+#         self.init_attrs()
+#         self.outputs = {"Out": convert_float_to_uint16(out)}
+
+#     def set_metax_gpu(self):
+#         self.__class__.use_custom_device = True
+#         self.place = paddle.CustomPlace("metax_gpu", 0)
+
+#     def test_check_output(self):
+#         self.check_output()
+
+#     def test_check_grad(self):
+#         self.check_grad(["X"], "Out")
+
+#     def init_test_case(self):
+#         self.ori_shape = (1, 3, 1, 40)
+#         self.axes = (0, 2)
+#         self.new_shape = (3, 40)
+
+#     def init_attrs(self):
+#         self.attrs = {"axes": self.axes}
+
+
+# Correct: There is a minus (negative) axis.
+class TestSqueezeOp1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 40)
+        self.axes = (0, -2)
+        self.new_shape = (3, 40)
+
+
+# Correct: No axes input.
+class TestSqueezeOp2(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 20, 1, 5)
+        self.axes = ()
+        self.new_shape = (20, 5)
+
+
+# Correct: Just part of the axes are squeezed.
+class TestSqueezeOp3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (6, 5, 1, 4)
+
+
+# Correct: The dimension of an axis that is not of size 1 remains unchanged.
+class TestSqueezeOp4(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (6, 1, 5, 1, 4, 1)
+        self.axes = (1, 2)
+        self.new_shape = (6, 5, 1, 4, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py
new file mode 100644
index 00000000000..40e46e70a21
--- /dev/null
+++ b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py
@@ -0,0 +1,295 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import unittest + +import numpy as np +from op_test import OpTest + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import _C_ops +from paddle.base import core +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.incubate.nn.functional import swiglu as fused_swiglu_impl + + +def swiglu(x, y, out_grad): + if isinstance(x, np.ndarray): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + out_grad = paddle.to_tensor(out_grad) + + origin_x = x.detach().clone() + origin_x.stop_gradient = False + x = origin_x + + origin_y = y.detach().clone() + origin_y.stop_gradient = False + y = origin_y + + dtype = x.dtype + need_convert = False + assert dtype == y.dtype + output_dtype = dtype + if paddle.is_compiled_with_cuda(): + if dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + x = x.astype(output_dtype) + y = y.astype(output_dtype) + need_convert = True + + out = F.silu(x) * y + if need_convert: + out = out.astype(dtype) + out.backward(out_grad) + ret = [ + out.astype(output_dtype), + origin_x.grad.astype(output_dtype), + origin_y.grad.astype(output_dtype), + ] + return ret + + +def fused_swiglu(x, y, out_grad): + x = x.detach().clone() + x.stop_gradient = False + if y is not None: + y = y.detach().clone() + y.stop_gradient = False + out = fused_swiglu_impl(x, y) + out.backward(out_grad) + + output_dtype = x.dtype + if paddle.is_compiled_with_cuda(): + if x.dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + ret = [ + out.astype(output_dtype), + ] + if y is not None: + x_grad, y_grad = x.grad, y.grad + else: + x_grad, y_grad = paddle.split(x.grad, 2, axis=-1) + + ret.append(x_grad.astype(output_dtype)) + ret.append(y_grad.astype(output_dtype)) + return ret + + +tol_map = { + paddle.float64: [1e-8, 1e-8], + paddle.float32: [1e-6, 1e-6], + paddle.float16: [1e-3, 1e-3], + paddle.bfloat16: [1e-3, 1e-3], +} + + +class TestSwiGLUDygraph(unittest.TestCase): + def check_dygraph_impl(self, device, shape, dtype): + x = paddle.randn(shape, dtype=dtype) + y = paddle.randn(shape, dtype=dtype) + out_grad = paddle.randn(shape, dtype=dtype) + + ret1 = swiglu(x, y, out_grad) + ret2 = fused_swiglu(x, y, out_grad) + ret3 = fused_swiglu(paddle.concat([x, y], axis=-1), None, out_grad) + + atol, rtol = tol_map[dtype] + err_msg = f"Failed when device = {device}, dtype = {dtype}, shape = {shape}" + for t1, t2, t3 in zip(ret1, ret2, ret3): + t1, t2, t3 = t1.numpy(), t2.numpy(), t3.numpy() + np.testing.assert_allclose(t1, t2, atol=atol, rtol=rtol, err_msg=err_msg) + np.testing.assert_equal(t2, t3, err_msg=err_msg) + + def check_dygraph(self, shape): + metas = [("cpu", paddle.float32), ("cpu", paddle.float64)] + if paddle.is_compiled_with_cuda(): + metas.append(("gpu", paddle.float32)) + metas.append(("gpu", paddle.float64)) + metas.append(("gpu", paddle.float16)) + prop = paddle.device.cuda.get_device_properties() + if prop.major >= 8: + metas.append(("gpu", paddle.bfloat16)) + + for device, dtype in metas: + origin_device = paddle.get_device() + paddle.set_device(device) + for with_split in [True]: + self.check_dygraph_impl(device, shape, dtype) + paddle.set_device(origin_device) + + def check_static_graph(self, shape, dtype="float32"): + x = paddle.static.data(name="x", shape=shape, dtype=dtype) + y = paddle.static.data(name="y", shape=shape, dtype=dtype) + concated_x = paddle.static.data( + name="concated_x", + shape=[*shape[:-1], shape[-1] * 2], + 
dtype=dtype, + ) + out1 = fused_swiglu_impl(x, y) + out2 = fused_swiglu_impl(concated_x) + + concated_x_np = np.random.random(concated_x.shape).astype(dtype) + x_np, y_np = np.split(concated_x_np, 2, axis=-1) + + exe = paddle.static.Executor() + t1, t2 = exe.run( + feed={"x": x_np, "y": y_np, "concated_x": concated_x_np}, + fetch_list=[out1, out2], + ) + np.testing.assert_equal(t1, t2) + + def check_main(self, shape): + self.check_dygraph(shape) + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + self.check_static_graph(shape) + paddle.disable_static() + + def test_main(self): + self.check_main([8, 100]) + self.check_main([4, 101]) + + +class TestSwigluOp(OpTest): + def config(self): + self.x_shape = (8, 128) + self.check_auto_parallel = True + + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + y = np.random.uniform(-1, 1, self.x_shape).astype("float64") + out_grad = np.random.uniform(-1, 1, self.x_shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + def test_check_output(self): + self.check_output(check_prim_pir=True) + + def test_check_grad(self): + self.check_grad( + ["x", "y"], + "out", + check_auto_parallel=self.check_auto_parallel, + check_dygraph=1, + check_prim_pir=True, + ) + + +class TestSwigluOp2(TestSwigluOp): + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + tmp_inputs = np.split(x, 2, axis=-1) + x = tmp_inputs[0] + y = tmp_inputs[1] + out_grad = np.random.uniform(-1, 1, x.shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + +@unittest.skipIf( + not paddle.base.core.is_compiled_with_dist(), + "The spmd rule is should be tested with distributed=ON", +) +class TestSwigluSpmd(unittest.TestCase): + def setUp(self): + self.kernel = "swiglu" + self.rule = paddle.base.core.get_phi_spmd_rule(self.kernel) + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [-1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.y_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec) + + def test_input_x_y(self): + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, self.y_dist_tensor_spec + ) + inferred_input_dist_attrs = result_dist_attrs[0] + inferred_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(inferred_input_dist_attrs), 2) + self.assertEqual(len(inferred_output_dist_attrs), 1) + self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [-1, 0]) + + def test_input_x_unshard_last_dim(self): + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + 
x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [0, -1] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, DistTensorSpec() + ) + inferred_input_dist_attrs = result_dist_attrs[0] + inferred_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(inferred_input_dist_attrs), 2) + self.assertEqual(len(inferred_output_dist_attrs), 1) + self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [0, -1]) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), "mamtul 0 size only with in cuda") +class TestSwiglu0SizeDygraph(unittest.TestCase): + def test_swiglu(self): + x = paddle.ones([0, 128], dtype="float32") + y = paddle.ones([0, 128], dtype="float32") + x.stop_gradient = False + y.stop_gradient = False + out = fused_swiglu_impl(x, y) + + dz = paddle.ones([0, 128], dtype="float32") + + out = _C_ops.swiglu_grad(x, y, dz) + + self.assertEqual(out[0].shape, x.shape) + self.assertEqual(out[1].shape, y.shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py new file mode 100644 index 00000000000..4369972255d --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py @@ -0,0 +1,162 @@ +# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle + + +def TopPProcess(probs, top_p): + sorted_probs = paddle.sort(probs, descending=True) + sorted_indices = paddle.argsort(probs, descending=True) + cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) + + # Remove tokens with cumulative probs above the top_p, But keep at + # least min_tokens_to_keep tokens + sorted_indices_to_remove = cumulative_probs > top_p + + # Keep the first token + sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype="int64") + + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, + (slice(None), slice(1, None)), + sorted_indices_to_remove[:, :-1].clone(), + ) + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, (slice(None), 0), 0 + ) + + # Scatter sorted tensors to original indexing + sorted_indices = ( + sorted_indices + paddle.arange(probs.shape[0]).unsqueeze(-1) * probs.shape[-1] + ) + condition = paddle.scatter( + sorted_indices_to_remove.flatten(), + sorted_indices.flatten(), + sorted_indices_to_remove.flatten(), + ) + condition = paddle.cast(condition, "bool").reshape(probs.shape) + probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) + next_tokens = paddle.multinomial(probs) + next_scores = paddle.index_sample(probs, next_tokens) + return next_scores, next_tokens + + +class TestTopPAPI(unittest.TestCase): + def setUp(self): + self.topp = 0.0 + self.seed = 6688 + self.batch_size = 3 + self.vocab_size = 10000 + self.dtype = "float32" + self.input_data = np.random.rand(self.batch_size, self.vocab_size) + + def run_dygraph(self, place): + with paddle.base.dygraph.guard(place): + input_tensor = paddle.to_tensor(self.input_data, self.dtype) + topp_tensor = paddle.to_tensor( + [ + self.topp, + ] + * self.batch_size, + self.dtype, + ).reshape((-1, 1)) + + # test case for basic test case 1 + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + # test case for basic test case 1 + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, + topp_tensor, + seed=-1, + k=5, + mode="non-truncated", + return_top=True, + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + input_tensor = paddle.static.data( + name="x", shape=[6, 1030], dtype=self.dtype + ) + topp_tensor = paddle.static.data( + name="topp", shape=[6, 1], dtype=self.dtype + ) + result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + exe = paddle.static.Executor(place) + input_data = np.random.rand(6, 1030).astype(self.dtype) + paddle_result = exe.run( + feed={ + "x": input_data, + "topp": np.array( + [ + self.topp, + ] + * 6 + ).astype(self.dtype), + }, + fetch_list=[ + result[0], + result[1], + ref_res[0], + ref_res[1], + ], + ) + np.testing.assert_allclose(paddle_result[0], paddle_result[2], rtol=1e-05) + 
np.testing.assert_allclose(paddle_result[1], paddle_result[3], rtol=1e-05) + + def test_dygraph(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_dygraph(place) + + def test_static(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_static(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py new file mode 100644 index 00000000000..ff22c2c9ac9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +from tests.op_test import OpTest +import paddle + +paddle.enable_static() + + +# Correct: General. +class TestUnsqueezeOp(OpTest): + def setUp(self): + self.set_metax_gpu() + self.op_type = "unsqueeze2" + self.dtype = "float32" + self.init_test_case() + self.inputs = {"X": np.random.random(self.ori_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: Single input index. +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1,) + self.new_shape = (20, 5, 1) + + +# Correct: Mixed input axis. +class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +# Correct: There is duplicated axis. +class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +# Correct: Reversed axes. 
+class TestUnsqueezeOp4(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (10, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (10, 1, 1, 2, 5, 1)
+
+
+# test float16
+class TestUnsqueezeOp5(TestUnsqueezeOp):
+    def init_test_case(self):
+        self.dtype = "float16"
+        self.ori_shape = (10, 2, 5)
+        self.axes = (3, 1, 1)
+        self.new_shape = (10, 1, 1, 2, 5, 1)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 528ec55971cd8e115b3d0a7e2103bd4ebf7493a5 Mon Sep 17 00:00:00 2001
From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Date: Tue, 16 Sep 2025 11:39:34 +0800
Subject: [PATCH 11/95] [Metax] update metax CI CMakeLists (#16)

* [Metax] update metax CI

* [Metax] update metax CI CMakeLists
---
 backends/metax_gpu/tests/CMakeLists.txt | 44 +++++++++++++++----------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt
index 7e549ef4eaa..37475773026 100755
--- a/backends/metax_gpu/tests/CMakeLists.txt
+++ b/backends/metax_gpu/tests/CMakeLists.txt
@@ -87,24 +87,32 @@ list(
 list(
   REMOVE_ITEM
   PYTHON_TEST_SCRIPTS
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py)
+  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # affected by test_sum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion
+  # adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # self._get_places()
+  # interface adaptation issue
+  # in op_test.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64
+  # precision
+  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # self._get_places()
+  # interface adaptation issue in op_test.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties
+)
 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)

 foreach(test_script ${PYTHON_TEST_SCRIPTS})

From 5b31405c13c32af5dbc826f7e8fec58e64a74322 Mon Sep 17 00:00:00 2001
From: duqimeng <77875733+duqimeng@users.noreply.github.com>
Date: Tue, 16 Sep 2025 15:02:29 +0800
Subject: [PATCH 12/95] [Metax] add github action (#18)

* [Metax] add github action

---------

Co-authored-by: Mingkun.Zhang <2496808993@qq.com>
Co-authored-by: metax666
Co-authored-by: jiaxinWang-metax <189149612@qq.com>
Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com>
Co-authored-by: chezhang <1376507468@qq.com>
Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com>
Co-authored-by: ZhouDuan <1184319564@qq.com>
---
 .github/workflows/metax_work.yaml | 52 +++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)
 create mode 100644 .github/workflows/metax_work.yaml

diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml
new file mode 100644
index 00000000000..0d3d2637cdd
--- /dev/null
+++ b/.github/workflows/metax_work.yaml
@@ -0,0 +1,52 @@
+name: paddle metax gpu test
+
+on:
+  workflow_dispatch:
+  pull_request:
+    types: [opened, synchronize]
+    branches: [develop, release/**]
+    paths:
+      - "**"
+      - "!backends/**"
+      - "backends/metax_gpu/**"
+
+permissions: read-all
+
+defaults:
+  run:
+    shell: bash
+
+jobs:
+  metax-gpu-test:
+    runs-on: paddle-metax-runner-set
+    steps:
+      - name: Checkout repository
+        run: |
+          git config --global user.name "GitHub Actions"
+          git config --global user.email "actions@github.com"
+
+          if [ "${{ github.event_name }}" == "pull_request" ]; then
+            BRANCH_NAME=${{ github.head_ref }}
+          else
+            BRANCH_NAME=${{ github.ref_name }}
+          fi
+
+          git clone \
+            --reference-if-able /home/runner/PaddleCustomDevice \
+            --depth=1 \
+            --shallow-submodules \
+            --jobs=8 \
+            --branch $BRANCH_NAME \
+            --recurse-submodules \
+            https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git .
+ + + - name: compile + run: | + cd backends/metax_gpu + bash build.sh + + - name: run test + run: | + cd backends/metax_gpu/tests + bash run_test.sh From b93c971b17729f09733faf5400d7ba44f1e5f3f2 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 15:15:34 +0800 Subject: [PATCH 13/95] [metax] chang build (#19) * [metax]chaneg build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dd0ab3aab90..c288ea22312 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,7 +20,7 @@ set -e pip uninstall paddlepaddle -y -export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ @@ -50,7 +50,7 @@ fi echo "make_maca" cd build cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON -make_maca -j8 +make_maca -j60 echo "install whl" pip install dist/paddle_metax_gpu*.whl --force-reinstall From 6dbbe848d672a27bbbdded8e399ff5b1229c6647 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:04:55 +0800 Subject: [PATCH 14/95] change_build (#20) * [metax]chaneg build --------- --- backends/metax_gpu/build.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index c288ea22312..e52cddc6476 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,15 +20,18 @@ set -e pip uninstall paddlepaddle -y -# export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# init paddle +git submodule sync --recursive && git submodule update --init --recursive + + +export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ # exit 1 -# init paddle -git submodule sync --recursive && git submodule update --init --recursive +unset http_proxy https_proxy # apply patch bash change_patch.sh From ef1b28e5d17ceac419de30f8ba129f16444bd39d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:18:54 +0800 Subject: [PATCH 15/95] change_build (#21) --- backends/metax_gpu/build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index e52cddc6476..a40cac19e19 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -25,6 +25,7 @@ git submodule sync --recursive && git submodule update --init --recursive export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +export pip install 
safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ From 3737e488da962ae43cde4d51e495454a2818eb01 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 16 Sep 2025 16:24:15 +0800 Subject: [PATCH 16/95] change_build (#22) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index a40cac19e19..e3c4304e5f8 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -30,7 
+30,6 @@ pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/ # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -# exit 1 unset http_proxy https_proxy From 16f35844e7218d0eb67aaffe6379c2a8820241e7 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Tue, 16 Sep 2025 16:52:30 +0800 Subject: [PATCH 17/95] =?UTF-8?q?=E3=80=90metax=E3=80=91modify=20cmake=20f?= =?UTF-8?q?or=20warpctc=20and=20warprnnt=20(#17)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel --- backends/metax_gpu/CMakeLists.txt | 4 +- backends/metax_gpu/cmake/warpctc.cmake | 7 +- backends/metax_gpu/cmake/warprnnt.cmake | 8 ++- .../fused_conv2d_add_act_kernel_register.cu | 2 +- .../conv_grad_kernel_register.cu | 42 ++++++++++-- .../kernels/gpudnn/conv_kernel_register.cu | 2 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 2 +- backends/metax_gpu/kernels/impl/warpctc.h | 64 ------------------- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 16 ++--- backends/metax_gpu/kernels/impl/warprnnt.h | 63 ------------------ .../kernels/impl/warprnnt_kernel_impl.h | 14 ++-- backends/metax_gpu/kernels/metax_context.cc | 20 +++++- backends/metax_gpu/kernels/metax_context.h | 1 + 14 files changed, 88 insertions(+), 159 deletions(-) rename backends/metax_gpu/kernels/{cuda_kernels => gpudnn}/conv_grad_kernel_register.cu (98%) delete mode 100644 backends/metax_gpu/kernels/impl/warpctc.h delete mode 100644 backends/metax_gpu/kernels/impl/warprnnt.h diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index cca23ab42f5..787aae13e40 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -736,7 +736,7 @@ add_library( target_include_directories( ${TARGET_NAME} PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + ${CUDA_INCLUDE_DIRS} ${WARPCTC_INCLUDE_DIR} ${WARPRNNT_INCLUDE_DIR} ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( @@ -749,6 +749,8 @@ target_link_libraries( protobuf external_error_proto dgc + ${WARPCTC_LIBRARIES} + ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 71c892a6cfa..9edc92f0a94 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -145,5 +145,8 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its # headers. 
-add_library(warpctc INTERFACE) -add_dependencies(warpctc extern_warpctc) +add_library(warpctc SHARED IMPORTED GLOBAL) +set_target_properties(warpctc PROPERTIES + IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR} +) \ No newline at end of file diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake index 54a7ad6be86..527f2e55a1b 100644 --- a/backends/metax_gpu/cmake/warprnnt.cmake +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -137,6 +137,8 @@ get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its # headers. -add_library(warprnnt INTERFACE) -# set_property(TARGET warprnnt PROPERTY IMPORTED_LOCATION ${WARPRNNT_LIBRARIES}) -add_dependencies(warprnnt extern_warprnnt) +add_library(warprnnt SHARED IMPORTED GLOBAL) +set_target_properties(warprnnt PROPERTIES + IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR} +) \ No newline at end of file diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu index ee4f105cbc5..48809ceefa4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,7 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, true, groups); + desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu similarity index 98% rename from backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index 885137675b4..e4acb2f95b6 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -161,7 +161,12 @@ void ConvCudnnGradKernelImplV7( args1.idesc.set(*transformed_input_grad, layout_tensor); args1.wdesc.set(*transformed_filter_channel, layout_tensor, iwo_groups); args1.odesc.set(*transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -184,7 +189,12 @@ void ConvCudnnGradKernelImplV7( args2.wdesc.set( *transformed_filter_grad_channel, layout_tensor, iwo_groups); args2.odesc.set(*transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); @@ -1073,7 +1083,12 @@ void ConvCudnnGradGradKernel( args1.idesc.set(transformed_ddX, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + 
args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -1092,7 +1107,12 @@ void ConvCudnnGradGradKernel( args2.idesc.set(transformed_X, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; @@ -1114,7 +1134,12 @@ void ConvCudnnGradGradKernel( args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; @@ -1136,7 +1161,12 @@ void ConvCudnnGradGradKernel( args4.idesc.set(transformed_dX, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bdff5fa9f93..bf129fed05c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,7 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, true); + args.cdesc.set(dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index aa1cc80d06d..928201c705f 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,7 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, false, c_groups); + args.cdesc.set(dtype, padding_common, strides, dilations_, phi::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/impl/warpctc.h b/backends/metax_gpu/kernels/impl/warpctc.h deleted file mode 100644 index ba5da472ade..00000000000 --- a/backends/metax_gpu/kernels/impl/warpctc.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warpctc/include/ctc.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warpctc_dso_flag; -extern void* warpctc_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warpctc routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using warpctcFunc = decltype(&::__name); \ - std::call_once(warpctc_dso_flag, []() { \ - warpctc_dso_handle = phi::dynload::GetWarpCTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - DYNAMIC_LOAD_WARPCTC_WRAP(__name) - -#define WARPCTC_ROUTINE_EACH(__macro) \ - __macro(get_warpctc_version); \ - __macro(ctcGetStatusString); \ - __macro(compute_ctc_loss); \ - __macro(compute_ctc_loss_double); \ - __macro(get_workspace_size); \ - __macro(get_workspace_size_double) - -WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); - -#undef DYNAMIC_LOAD_WARPCTC_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index 51f4ce86890..dc9bc376e63 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warpctc.h" +#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index 9794ba1b3c0..e0b15feca03 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warpctc.h" +#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" @@ -58,7 +58,7 @@ class ComputeCtcLossFunctor { float* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss(activations, + return compute_ctc_loss(activations, gradients, flat_labels, label_lengths, @@ -84,7 +84,7 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss_double( + return compute_ctc_loss_double( activations, gradients, flat_labels, @@ -141,14 +141,14 @@ class WarpCTCFunctor { ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { status = - phi::dynload::get_workspace_size(cpu_label_lengths, + get_workspace_size(cpu_label_lengths, 
cpu_input_lengths, static_cast(sequence_width), static_cast(num_sequences), options_, &workspace_bytes); } else { - status = phi::dynload::get_workspace_size_double( + status = get_workspace_size_double( cpu_label_lengths, cpu_input_lengths, static_cast(sequence_width), @@ -162,7 +162,7 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -197,12 +197,12 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); } protected: void init(const Context& dev_ctx, const size_t blank) { - warpctc_version_ = phi::dynload::get_warpctc_version(); + warpctc_version_ = get_warpctc_version(); if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { diff --git a/backends/metax_gpu/kernels/impl/warprnnt.h b/backends/metax_gpu/kernels/impl/warprnnt.h deleted file mode 100644 index 50b0dfc0efc..00000000000 --- a/backends/metax_gpu/kernels/impl/warprnnt.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warprnnt/include/rnnt.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warprnnt_dso_flag; -extern void* warprnnt_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warprnnt routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using warprnntFunc = decltype(&::__name); \ - std::call_once(warprnnt_dso_flag, []() { \ - warprnnt_dso_handle = phi::dynload::GetWarpRNNTDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - DYNAMIC_LOAD_WARPRNNT_WRAP(__name) - -#define WARPRNNT_ROUTINE_EACH(__macro) \ - __macro(get_warprnnt_version); \ - __macro(rnntGetStatusString); \ - __macro(compute_rnnt_loss); \ - __macro(compute_rnnt_loss_fp64); \ - __macro(get_rnnt_workspace_size); - -WARPRNNT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP); - -#undef DYNAMIC_LOAD_WARPRNNT_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index bb4311f5912..457fdcb9bff 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,7 +16,7 @@ #include -#include "kernels/impl/warprnnt.h" +#include "third_party/warprnnt/include/rnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -55,7 +55,7 @@ class ComputeRnntLossFunctor { float* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss(activations, + return compute_rnnt_loss(activations, gradients, label, label_lengths, @@ -81,7 +81,7 @@ class ComputeRnntLossFunctor { double* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss_fp64(activations, + return compute_rnnt_loss_fp64(activations, gradients, label, label_lengths, @@ -149,7 +149,7 @@ class WarpRNNTFunctor { } size_t workspace_bytes = 0; - status = phi::dynload::get_rnnt_workspace_size( + status = get_rnnt_workspace_size( maxT, maxU, B, gpu, &workspace_bytes, sizeof(T)); PADDLE_ENFORCE_EQ( @@ -158,7 +158,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_rnnt_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -190,7 +190,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); } protected: @@ -200,7 +200,7 @@ class WarpRNNTFunctor { const size_t blank, const float fastemit_lambda, const int num_threads) { - warprnnt_version_ = phi::dynload::get_warprnnt_version(); + warprnnt_version_ = get_warprnnt_version(); options_.maxT = maxT; options_.maxU = maxU; diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_context.cc index 4df4d88b0b4..f0c92f00565 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_context.cc @@ -15,7 +15,25 @@ #include "kernels/metax_context.h" namespace phi { -bool AllowTF32Cudnn() { return false; } +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return 
allow_tf32_cudnn; } + void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_context.h index 5974aadcc41..683a6df7017 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_context.h @@ -128,6 +128,7 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } +bool AllowTF32Cublas(); bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { From ce54693240221505b150900fb601e640181a5620 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Tue, 16 Sep 2025 18:12:37 +0800 Subject: [PATCH 18/95] [metax]modify library to static library (#24) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library --- backends/metax_gpu/cmake/warpctc.cmake | 19 +++++++++---------- backends/metax_gpu/cmake/warprnnt.cmake | 19 +++++++++---------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 9edc92f0a94..0733c0f9ce5 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -66,11 +66,11 @@ set(WARPCTC_LIB_DIR if(WIN32) set(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) else() set(WARPCTC_LIBRARIES - "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) endif() @@ -93,10 +93,10 @@ if(WIN32) set(WARPCTC_CXX_FLAGS_DEBUG $) else() - set(WARPCTC_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPCTC_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPCTC_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPCTC_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() @@ -127,7 +127,7 @@ ExternalProject_Add( -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} -DWITH_TORCH=OFF -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON + -DBUILD_SHARED=OFF -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} @@ -145,8 +145,7 @@ get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its # headers. 
-add_library(warpctc SHARED IMPORTED GLOBAL) -set_target_properties(warpctc PROPERTIES - IMPORTED_LOCATION ${WARPCTC_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR} -) \ No newline at end of file +add_library(warpctc STATIC IMPORTED GLOBAL) +set_target_properties( + warpctc PROPERTIES IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR}) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake index 527f2e55a1b..a8d6683af2b 100644 --- a/backends/metax_gpu/cmake/warprnnt.cmake +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -62,11 +62,11 @@ set(WARPRNNT_LIB_DIR if(WIN32) set(WARPRNNT_LIBRARIES - "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-rnnt Library" FORCE) else() set(WARPRNNT_LIBRARIES - "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_SHARED_LIBRARY_SUFFIX}" + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-rnnt Library" FORCE) endif() @@ -90,10 +90,10 @@ if(WIN32) set(WARPRNNT_CXX_FLAGS_DEBUG $) else() - set(WARPRNNT_C_FLAGS ${CMAKE_C_FLAGS}) + set(WARPRNNT_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) - set(WARPRNNT_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + set(WARPRNNT_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() @@ -120,7 +120,7 @@ ExternalProject_Add( -DWITH_ROCM=${WITH_ROCM} -DWITH_OMP=${USE_OMP} -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} - -DBUILD_SHARED=ON + -DBUILD_SHARED=OFF -DBUILD_TESTS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} @@ -137,8 +137,7 @@ get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its # headers. 
-add_library(warprnnt SHARED IMPORTED GLOBAL) -set_target_properties(warprnnt PROPERTIES - IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} - INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR} -) \ No newline at end of file +add_library(warprnnt STATIC IMPORTED GLOBAL) +set_target_properties( + warprnnt PROPERTIES IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR}) From 4cda637ff68d88adfd88c322d4d55c9d7dd15397 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 16 Sep 2025 18:14:09 +0800 Subject: [PATCH 19/95] [Metax] organize documents (#25) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents --- .../calc_reduced_attn_kernel_register.cu | 2 +- backends/metax_gpu/kernels/funcs/softmax.cu | 2 +- .../kernels/funcs/values_vectors_functor.h | 2 +- .../metax_gpu/kernels/gpudnn/conv_cudnn_v7.h | 2 +- .../conv_transpose_grad_kernel_register.cu | 2 +- .../kernels/gpudnn/pool_kernel_register.cu | 2 +- .../metax_gpu/kernels/gpudnn/softmax_gpudnn.h | 2 +- .../kernels/impl/dirichlet_kernel_impl.h | 2 +- .../addmm_grad_kernel_register.cu | 0 .../addmm_kernel_register.cu | 0 .../batch_fc_grad_kernel_register.cu | 0 .../batch_norm_grad_kernel_register.cu | 2 +- .../batch_norm_kernel_register.cu | 0 .../bilinear_grad_kernel_register.cu | 0 .../bilinear_kernel_register.cu | 0 .../metax_kernel/blha_get_max_len_register.cu | 2 +- .../bmm_grad_kernel_register.cu | 0 .../bmm_kernel_register.cu | 0 ...abel_cross_entropy_grad_kernel_register.cu | 0 .../cholesky_grad_kernel_register.cu | 0 .../metax_kernel/cholesky_kernel_register.cu | 2 +- .../conv_kernel_register.cu | 0 .../conv_transpose_kernel_register.cu | 0 .../crop_kernel_register.cu | 0 .../cross_entropy_kernel_register.cu | 2 +- .../depthwise_conv_grad_kernel.cu | 0 .../depthwise_conv_kernel.cu | 0 .../kernels/{ => metax_kernel}/elementwise.h | 0 .../{ => metax_kernel}/flags_declare.cu | 0 .../flash_attn_grad_kernel.cu | 0 .../{ => metax_kernel}/flash_attn_kernel.cu | 0 .../{ => metax_kernel}/flash_attn_kernel.h | 0 .../{ => metax_kernel}/flash_attn_utils.h | 0 .../kernels/{ => metax_kernel}/flashattn.cc | 0 .../kernels/{ => metax_kernel}/flashattn.h | 0 .../flatten2_grad_kernel_register.cu | 0 .../flatten2_kernel_register.cu | 0 .../fused_conv2d_add_act_kernel_register.cu | 3 +- .../fused_rope_grad_kernel_register.cu | 0 .../fused_rope_kernel_register.cu | 0 .../instance_norm_grad_kerne_registerl.cu | 2 +- .../instance_norm_kernel_register.cu | 2 +- .../layer_norm_grad_kernel_register.cu | 0 .../layer_norm_kernel_register.cu | 0 .../lstm_kernel_register.cu | 0 .../metax_kernel/lu_kernel_register.cu | 2 +- .../lu_solve_grad_kernel_register.cu | 0 .../metax_kernel/matrix_rank_tol_kernel.cu | 2 +- .../{ => metax_kernel}/metax_context.cc | 24 +-- .../{ => metax_kernel}/metax_context.h | 6 +- .../multi_dot_grad_kernel_register.cu | 0 .../multi_dot_kernel_register.cu | 0 .../mv_grad_kernel_register.cu | 0 .../mv_kernel_register.cu | 0 .../metax_kernel/qr_kernel_register.cu | 2 +- .../rank_attention_grad_kernel_register.cu | 0 .../rank_attention_kernel_register.cu | 0 .../metax_kernel/rnn_grad_kernel.cu.cc | 2 +- .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 +- .../slogdeterminant_kernel_register.cu | 0 .../softmax_kernel_grad_register.cu | 0 .../softmax_kernel_register.cu | 0 .../solve_grad_kernel_register.cu | 0 .../standard_gamma_kernel_register.cu | 0 .../stft_kernel_register.cu | 0 
.../svd_kernel_register.cu | 0 .../top_k_grad_kernel_register.cu | 0 .../triangular_solve_grad_kernel_register.cu | 0 .../triangular_solve_kernel_register.cu | 0 .../warprnnt_kernel_register.cu | 0 .../weight_only_linear_kernel.cu | 0 .../weight_quantize_kernel_register.cu | 0 backends/metax_gpu/patch/paddle.patch | 204 +++++++++--------- backends/metax_gpu/tests/CMakeLists.txt | 54 ++--- 74 files changed, 166 insertions(+), 163 deletions(-) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_fc_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/cholesky_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_transpose_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/crop_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_grad_kernel.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/elementwise.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flags_declare.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_grad_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_utils.h (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.cc (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.h (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_conv2d_add_act_kernel_register.cu (99%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lstm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lu_solve_grad_kernel_register.cu (100%) rename 
backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.cc (90%) rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.h (96%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/slogdeterminant_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_grad_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/solve_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/standard_gamma_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/stft_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/svd_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/top_k_grad_kernel_register.cu (100%) mode change 100755 => 100644 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/warprnnt_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_only_linear_kernel.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_quantize_kernel_register.cu (100%) diff --git a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu index 11def2c9ee4..2aa8424f0b1 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/calc_reduced_attn_kernel.h" diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu index d738a53f43a..44bfd02a308 100644 --- a/backends/metax_gpu/kernels/funcs/softmax.cu +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h index ec429950872..8c5996e680b 100644 --- a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h +++ b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h @@ -24,7 +24,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/common/errors.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h index da61a1e5b41..a0f89047045 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h +++ b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "glog/logging.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu index 0067818d165..b7eebfcee2e 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "kernels/gpudnn/conv_cudnn_v7.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu index c115f5ad930..1c2bfeedf34 100644 --- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "gpudnn/pool_gpudnn.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h index 168752700e9..5844886ad1b 100644 --- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h +++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/primitive/kernel_primitives.h" // See Note [ Why still include the fluid headers? 
] -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h index 70af87513e5..c2e2e341bf5 100644 --- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h @@ -17,7 +17,7 @@ #include #include -#include "kernels/elementwise.h" +#include "kernels/metax_kernel/elementwise.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu index 062646bbf9d..52fe5a1d566 100644 --- a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/flags.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu index bc9eb23c0e8..42810569fde 100644 --- a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" #include "kernels/metax_kernel/block_attn.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu 
b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu index e8fae2d9da5..8a39ae3f0a8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu index e94862ec7b0..043a64dc149 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/kernels/cross_entropy_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu diff --git a/backends/metax_gpu/kernels/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h similarity index 100% rename from backends/metax_gpu/kernels/elementwise.h rename to backends/metax_gpu/kernels/metax_kernel/elementwise.h diff --git a/backends/metax_gpu/kernels/flags_declare.cu b/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu similarity index 100% rename from backends/metax_gpu/kernels/flags_declare.cu rename to backends/metax_gpu/kernels/metax_kernel/flags_declare.cu diff --git a/backends/metax_gpu/kernels/flash_attn_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h diff --git a/backends/metax_gpu/kernels/flash_attn_utils.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_utils.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h diff --git a/backends/metax_gpu/kernels/flashattn.cc b/backends/metax_gpu/kernels/metax_kernel/flashattn.cc similarity index 100% rename from backends/metax_gpu/kernels/flashattn.cc rename to backends/metax_gpu/kernels/metax_kernel/flashattn.cc diff --git a/backends/metax_gpu/kernels/flashattn.h b/backends/metax_gpu/kernels/metax_kernel/flashattn.h similarity index 100% rename from backends/metax_gpu/kernels/flashattn.h rename to backends/metax_gpu/kernels/metax_kernel/flashattn.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu similarity index 99% rename from backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu index 48809ceefa4..c0d15b7f1b4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,8 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); + desc->set( + dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu index d7540d949a9..bdf341f5a35 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu index db975d74665..e0c0ae9c1d6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu index 5a2d85418a1..72e4c5b2b79 100644 --- a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu @@ -18,7 +18,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu index bda5dc62f1a..d8c3355e6e4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu @@ -18,7 +18,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/type_traits.h" diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc similarity index 90% rename from backends/metax_gpu/kernels/metax_context.cc rename to backends/metax_gpu/kernels/metax_kernel/metax_context.cc index f0c92f00565..62aaa5fb2de 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -12,27 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" namespace phi { const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; }(); const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; }(); bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h similarity index 96% rename from backends/metax_gpu/kernels/metax_context.h rename to backends/metax_gpu/kernels/metax_kernel/metax_context.h index 683a6df7017..a6610c1dab2 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ -#define BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ +#define BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ #include #include #include @@ -161,4 +161,4 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, return DnnWorkspaceHandle(alloactor, stream); } } // namespace phi -#endif // BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#endif // BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 745069e2eda..c3041254444 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,7 +22,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc index 499832049e4..101b51aa350 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/rnn_grad_kernel.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index f1cf9e09dc7..2598ce093e6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/rnn_kernel.h" #include "glog/logging.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu similarity index 
100% rename from backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu old mode 100755 new mode 100644 similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 0283a443adb..e56826c4f3e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index 
cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 1547909d92..66b2779392 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -230,28 +230,28 @@ index 4ff2e528a9..81421c8ca1 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, 
phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 @@ -351,7 +351,7 @@ index 024a7de73e..1e4cdf16be 100644 +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. */ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. 
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,19 +514,19 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 @@ -535,7 +535,7 @@ index e30d440ff3..3c74792690 100644 @@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,14 +721,14 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; - asm("bfi.b32 %0, %1, %2, %3, %4;" - : "=r"(ret) - : "r"(to_insert), "r"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (32 - pos - len)) >> (32 - len); return ret; } @@ -738,12 +738,12 @@ index e30d440ff3..3c74792690 100644 int len) { uint64_t ret; - asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); -+ ++ + + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -751,7 +751,7 @@ index e30d440ff3..3c74792690 100644 - asm("bfi.b64 %0, %1, %2, %3, %4;" - : "=l"(ret) - : "l"(to_insert), "l"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); + return ret; @@ -763,7 +763,7 @@ index e30d440ff3..3c74792690 100644 int lane_id; - asm("mov.s32 %0, %%laneid;" : "=r"(lane_id)); - return lane_id; -+ ++ +// // >>>> PTX2CPP Success <<<< +// { +// 
(lane_id)=(threadIdx.x&(warpSize-1)); @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,12 +843,12 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,7 +863,7 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +890,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +901,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +914,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -948,7 +948,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +961,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +993,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,27 +1013,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1048,12 +1048,12 @@ index 5ebbc8d2db..48acf8d0cd 100644 -#include "paddle/phi/kernels/funcs/quant_dequant.h" +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_context.h" - ++#include "kernels/metax_kernel/metax_context.h" + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1067,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1080,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,14 +1118,14 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/third_party/flagcx b/third_party/flagcx index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 37475773026..410ef006514 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -87,32 +87,34 @@ list( list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # 受 test_sum_op.py 影响 - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion - # 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # op_test.py 里 - # self._get_places() - # 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64 - # precision - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # op_test.py 里 - # self._get_places() 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties -) + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) foreach(test_script ${PYTHON_TEST_SCRIPTS}) From 23fca59cd47c30680a01e9ec79f5d4d16d156320 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 10:44:44 +0800 Subject: [PATCH 20/95] [metax]fix_code style and index_elementwise_put_kernel (#27) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: 
MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/CMakeLists.txt | 15 +++-- ...ex_elementwise_put_grad_kernel_register.cu | 18 ++++- .../index_elementwise_put_kernel_register.cu | 18 ++++- .../kernels/gpudnn/conv_kernel_register.cu | 3 +- .../kernels/gpudnn/conv_transpose_kernel.cu | 7 +- .../kernels/impl/warpctc_grad_kernel_impl.h | 2 +- .../kernels/impl/warpctc_kernel_impl.h | 67 +++++++++---------- .../kernels/impl/warprnnt_kernel_impl.h | 39 +++++------ 8 files changed, 103 insertions(+), 66 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 787aae13e40..f282a9fbf7c 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -666,7 +666,6 @@ file( # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu # ############################################################################ - # kernels/fusion kernels/selected_rows ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -713,10 +712,7 @@ file( kernels/cuda_kernels/*.cc kernels/cuda_kernels/*.cu kernels/funcs/blas/*.cc - kernels/ernie_core/*.cu - kernels/ernie_core/rms_norm_kernel_register.cu - kernels/ernie_core/top_p_sampling_kernel_register.cu - kernels/ernie_core/fused_bias_act_kernel_register.cu) + kernels/ernie_core/*.cu) set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) @@ -735,8 +731,13 @@ add_library( target_include_directories( ${TARGET_NAME} - PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${WARPCTC_INCLUDE_DIR} ${WARPRNNT_INCLUDE_DIR} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + PRIVATE ${PADDLE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/kernels + ${CUDA_INCLUDE_DIRS} + ${WARPCTC_INCLUDE_DIR} + ${WARPRNNT_INCLUDE_DIR} + ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu index c8d69cecae1..f935014d17b 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_grad_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor_grad, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorGradKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu index 391dd908a8d..533204b8102 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bf129fed05c..0a83b504c76 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); + args.cdesc.set( + dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 928201c705f..532b7af0db4 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,12 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, phi::AllowTF32Cudnn(), c_groups); + args.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index dc9bc376e63..16b740d5523 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ 
b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -24,6 +23,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index e0b15feca03..cb39a0171ba 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "third_party/warpctc/include/ctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" @@ -25,6 +24,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { @@ -59,15 +59,15 @@ class ComputeCtcLossFunctor { void* workspace, ctcOptions options) { return compute_ctc_loss(activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -84,17 +84,16 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return compute_ctc_loss_double( - activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return compute_ctc_loss_double(activations, + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -140,21 +139,19 @@ class WarpCTCFunctor { size_t workspace_bytes = 0; ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { - status = - get_workspace_size(cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } else { - status = get_workspace_size_double( - cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size_double(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } PADDLE_ENFORCE_EQ( CTC_STATUS_SUCCESS, diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 457fdcb9bff..8e3ab6fcdac 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,12 +16,12 @@ #include -#include "third_party/warprnnt/include/rnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include 
"paddle/phi/kernels/funcs/math_function.h" +#include "third_party/warprnnt/include/rnnt.h" namespace phi { @@ -56,15 +56,15 @@ class ComputeRnntLossFunctor { void* workspace, rnntOptions options) { return compute_rnnt_loss(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -82,15 +82,15 @@ class ComputeRnntLossFunctor { void* workspace, rnntOptions options) { return compute_rnnt_loss_fp64(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -117,6 +117,7 @@ class WarpRNNTFunctor { * \param blank blank label used in rnnt loss function. * \param cpu_loss loss of each example in CPU memory. */ + void operator()(const Context& dev_ctx, const T* input, T* gradient, From a513aaeb4c895177cd1c6b91d8d3b3c6b8ffe5a6 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:07:44 +0800 Subject: [PATCH 21/95] change_build_917 (#29) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index e3c4304e5f8..2bee14930a3 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -24,14 +24,14 @@ pip uninstall paddlepaddle -y git submodule sync --recursive && git submodule update --init --recursive -export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 -export +# export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +# export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -unset http_proxy https_proxy +# unset http_proxy https_proxy # apply patch bash change_patch.sh From 4eb455e0f14f4a74bfd91e3fd44d67500af2a2c0 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 11:19:49 +0800 Subject: [PATCH 22/95] chang_build (#30) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels 
& update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 2bee14930a3..de409153472 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -22,12 +22,16 @@ pip uninstall paddlepaddle -y # init paddle git submodule sync --recursive && git submodule update --init --recursive +sleep 1000000 +unset http_proxy https_proxy # export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 # export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle + + python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ From 1773978409b36845416e6491a6b5a2e06ff49992 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 17 Sep 2025 13:59:58 +0800 
Subject: [PATCH 23/95] [metax]modify kernel (#31) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel --- backends/metax_gpu/patch/paddle.patch | 257 ++++++++++++++------------ 1 file changed, 138 insertions(+), 119 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index e56826c4f3e..667d9f75d1c 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h -index 1547909d92..66b2779392 100644 +index 1547909d92..ef20838434 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -226,32 +226,32 @@ index c5309e7e11..3328571380 100644 } \ }; \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 4ff2e528a9..81421c8ca1 100644 +index 4ff2e528a9..23f7f4b583 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex 
CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 @@ -351,7 +351,7 @@ index 024a7de73e..1e4cdf16be 100644 +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. 
*/ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,28 +514,28 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index e30d440ff3..3c74792690 100644 +index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -30,11 +30,11 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,7 +721,7 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; @@ -743,7 +743,7 @@ index e30d440ff3..3c74792690 100644 + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. 
Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,14 +843,27 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { +diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu +index 4c93778bde..c7bdf8a2cc 100644 +--- a/paddle/phi/kernels/gpu/correlation_kernel.cu ++++ b/paddle/phi/kernels/gpu/correlation_kernel.cu +@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, + int stride2, + int corr_type_multiply, + DenseTensor *out) { +- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; ++ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; + PADDLE_ENFORCE_EQ( + is_gpu_place, + true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,9 +876,22 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
+diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu +index c2ddfa1347..c6adf5a6de 100644 +--- a/paddle/phi/kernels/gpu/dgc_kernel.cu ++++ b/paddle/phi/kernels/gpu/dgc_kernel.cu +@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, + int buf_size = paddle::communication::dgc::get_buffer_size(k); + phi::Allocator::AllocationPtr tmp_ious_data; + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + tmp_ious_data = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +916,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +927,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +940,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -930,6 +956,19 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" +diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +index 05a977828f..5136608c41 100644 +--- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu ++++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, + int64_t seed_int = 0; + if (seed.initialized()) { + const auto& seed_place = seed.place().GetType(); +- bool is_gpu_place = seed_place == phi::AllocationType::GPU; ++ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; + if (is_gpu_place) { + // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would + // not be CUDAPlace in practice. This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -948,7 +987,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +1000,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +1032,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,31 +1052,31 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -index 5ebbc8d2db..48acf8d0cd 100644 +index 5ebbc8d2db..c7b6c338e2 100644 --- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -15,8 +15,9 @@ limitations under the License. */ @@ -1049,11 +1088,11 @@ index 5ebbc8d2db..48acf8d0cd 100644 +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" +#include "kernels/metax_kernel/metax_context.h" - + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1106,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1119,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,31 +1157,11 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - -diff --git a/third_party/flagcx b/third_party/flagcx -index 7c469f4af9..7e6c4cc3ca 160000 ---- a/third_party/flagcx -+++ b/third_party/flagcx -@@ -1 +1 @@ --Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f -+Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa -diff --git a/third_party/flashattn b/third_party/flashattn -index 581e48aa69..749aca3807 160000 ---- a/third_party/flashattn -+++ b/third_party/flashattn -@@ -1 +1 @@ --Subproject commit 581e48aa693a17ec3676ec2715d46130310d318d -+Subproject commit 749aca380794b472096d4e7ea01dd252ab0887c9 -diff --git a/third_party/yaml-cpp b/third_party/yaml-cpp ---- a/third_party/yaml-cpp -+++ b/third_party/yaml-cpp -@@ -1 +1 @@ --Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91 -+Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91-dirty + From 69af38186ebfd6029d6e5b1a057d6e8fa389ee08 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:07:26 +0800 Subject: [PATCH 24/95] change_metax_work (#32) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 0d3d2637cdd..c23112f0545 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -19,27 +19,28 @@ defaults: jobs: metax-gpu-test: runs-on: paddle-metax-runner-set + # runs-on: debug-paddle-runner-set steps: - name: Checkout repository run: | git config --global user.name "GitHub Actions" git config --global user.email "actions@github.com" - if [ "${{ github.event_name }}" == "pull_request" ]; then - BRANCH_NAME=${{ github.head_ref }} - else - BRANCH_NAME=${{ github.ref_name }} - fi - git clone \ --reference-if-able /home/runner/PaddleCustomDevice \ --depth=1 \ --shallow-submodules \ --jobs=8 \ - --branch $BRANCH_NAME \ + --branch ${{ github.base_ref }} \ --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
+ if [ "${{ github.event_name }}" == "pull_request" ]; then + git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head + git checkout pull/${{ github.event.pull_request.number }}/head + git submodule update --init --recursive + fi + - name: compile run: | From 7fe6f2dca92c3c0e3fb4c4ceb7f18a26560422e9 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 17:18:26 +0800 Subject: [PATCH 25/95] change_build (#33) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan 
<1184319564@qq.com> --- backends/metax_gpu/build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index de409153472..dbd583c52ea 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -22,8 +22,8 @@ pip uninstall paddlepaddle -y # init paddle git submodule sync --recursive && git submodule update --init --recursive -sleep 1000000 -unset http_proxy https_proxy +# sleep 1000000 +# unset http_proxy https_proxy # export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 From b22fc1317d786931c1aa8784ad30dd72b6dfc2fd Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 17 Sep 2025 17:58:21 +0800 Subject: [PATCH 26/95] [metax] modify fused_bias_dropout_residual_layer_norm (#34) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm --- backends/metax_gpu/patch/paddle.patch | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 667d9f75d1c..b7bdb953077 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -470,6 +470,25 @@ index 88663ec880..98b93072a3 100644 #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" +diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +index 4eae698648..5c047723ea 100644 +--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h ++++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +@@ -43,11 +43,11 @@ template + using LayerNormParamType = typename CudnnDataType::BatchNormParamType; + + inline static int GetDesiredBlockDim(int64_t block_dim) { +- const int kMaxBlockDim = 512; ++ const int kMaxBlockDim = 256; + #ifdef __HIPCC__ + const int lwarpSize = 64; + #else +- const int lwarpSize = 32; ++ const int lwarpSize = 64; + #endif + return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; + } + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h From c3d1444ef67441b9bb43f9fa5ee7c5a906a7f9df Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:18:30 +0800 Subject: [PATCH 27/95] change_build (#35) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 6 ++++-- backends/metax_gpu/build.sh | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index c23112f0545..74de39c2e13 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -31,14 +31,16 @@ jobs: --depth=1 \ --shallow-submodules \ --jobs=8 \ - --branch ${{ github.base_ref }} \ + --branch ${{ github.base_ref || github.ref_name}} \ + --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
if [ "${{ github.event_name }}" == "pull_request" ]; then git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - git submodule update --init --recursive + + # git submodule update --init --recursive fi diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dbd583c52ea..042b779a05c 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -21,7 +21,8 @@ pip uninstall paddlepaddle -y # init paddle -git submodule sync --recursive && git submodule update --init --recursive +# git submodule sync --recursive && git submodule update --init --recursive + # sleep 1000000 # unset http_proxy https_proxy From 569a867b358d9d3707c8d41dbbb0641d03e75de8 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 17 Sep 2025 18:21:54 +0800 Subject: [PATCH 28/95] change_build (#36) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 74de39c2e13..51c0c62cef6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -32,7 +32,6 @@ jobs: --shallow-submodules \ --jobs=8 \ --branch ${{ github.base_ref || github.ref_name}} \ - --recurse-submodules \ https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . From 0edc6f6549fff51d459bf9a77bfbedf4e6a33beb Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 10:46:15 +0800 Subject: [PATCH 29/95] change_warpctc.cmake (#38) * change_warpctc.cmake --- backends/metax_gpu/cmake/warpctc.cmake | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index 0733c0f9ce5..ea8e2ade754 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -35,6 +35,13 @@ else() git checkout -- . 
&& git checkout ${WARPCTC_TAG} && patch -Nd ${SOURCE_DIR} < ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) + file(COPY ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh + DESTINATION ${SOURCE_DIR}/include/contrib/moderngpu/include/device/) + message(STATUS "atch file path: ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh") + message( + STATUS + "ModernGPU device path: ${SOURCE_DIR}/include/contrib/moderngpu/include/device/" + ) endif() if(NOT WIN32 AND WITH_GPU) From 2688c8664cc50961267be572ed467ce4b89bc351 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 11:44:44 +0800 Subject: [PATCH 30/95] change_warpctc.cmake (#39) * change warpctc.cmake --- backends/metax_gpu/change_patch.sh | 3 ++- backends/metax_gpu/cmake/warpctc.cmake | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 60d74ec0f3d..f29986a3780 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -21,8 +21,9 @@ unzip mcEigen_3.4.0_paddle_final.zip mv mcEigen_3.4.0_paddle_final eigen3 cd .. cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 +rm -r patch/eigen3 cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - -cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ +# cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake index ea8e2ade754..5d668032fb1 100644 --- a/backends/metax_gpu/cmake/warpctc.cmake +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -108,6 +108,10 @@ else() set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) endif() +set(COPY_COMMAND + ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh" + "${SOURCE_DIR}/include/contrib/moderngpu/include/device/") + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} @@ -117,6 +121,7 @@ ExternalProject_Add( PATCH_COMMAND COMMAND ${WARPCTC_PATCH_COMMAND} COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${COPY_COMMAND} COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND} # BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} From 6f031fe12a2020044b898b2b2921c899df3d4e3a Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:10:23 +0800 Subject: [PATCH 31/95] test (#40) * test --------- --- backends/metax_gpu/tests/run_test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index 95cce650e6b..92dea2b492b 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -22,6 +22,8 @@ TEST_PATH1="${SCRIPT_DIR}/../../../python" TEST_PATH2="${SCRIPT_DIR}/../../../python/tests" export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}" +export +sleep 1000000 rm -r build mkdir -p build && cd build From e84d399d6056f6dd017031514045a608e717b223 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 14:42:12 +0800 Subject: [PATCH 32/95] test_ut (#41) * change_run_ut --------- --- backends/metax_gpu/tests/run_test.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh 
index 92dea2b492b..7d1e8e072a9 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -23,7 +23,8 @@ TEST_PATH2="${SCRIPT_DIR}/../../../python/tests" export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}" export -sleep 1000000 +# sleep 1000000 + rm -r build mkdir -p build && cd build @@ -34,4 +35,4 @@ cmake .. cmake --build . -ctest -j1 --output-on-failure +ctest -j10 --output-on-failure From b5f2feb398cae8217d1dff39a5e7ef31afa0e02d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 16:59:28 +0800 Subject: [PATCH 33/95] tets (#43) * remove_tets --------- --- backends/metax_gpu/build.sh | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index 042b779a05c..9ca589a7807 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -57,7 +57,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j60 echo "install whl" diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 410ef006514..08273782be6 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -81,8 +81,7 @@ list( ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reduce_op.py) + ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) list( REMOVE_ITEM From e20eca7e6f9846583293e988b7484380a25f314f Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 18 Sep 2025 18:53:51 +0800 Subject: [PATCH 34/95] test (#44) * test --------- --- backends/metax_gpu/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 08273782be6..795a3c5b8ac 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -95,7 +95,7 @@ list( ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口适配问题 + # op_test.py 里 self._get_places()接口的适配问题 ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # device == "gpu" 适配问题 From e37f633a4d440a25126273ccddd7c3ff23288a02 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Fri, 19 Sep 2025 18:30:47 +0800 Subject: [PATCH 35/95] [metax] modify compile (#42) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas --- backends/metax_gpu/CMakeLists.txt | 40 +- backends/metax_gpu/compile.sh | 2 +- .../kernels/funcs/blas/blas_impl.cu.h | 1270 ++++++++--------- .../fused_adam_kernel_register.cu | 0 ...esidual_layer_norm_grad_kernel_register.cu | 0 ...out_residual_layer_norm_kernel_register.cu | 0 ...dding_eltwise_layernorm_kernel_register.cu | 0 .../fused_layernorm_kernel_register.cu | 0 
.../fused_seqpool_cvm_grad_kernel_register.cu | 0 .../fused_seqpool_cvm_kernel_register.cu | 0 ...fused_softmax_mask_grad_kernel_register.cu | 0 .../fused_softmax_mask_kernel_register.cu | 0 ...max_mask_upper_triangle_kernel_register.cu | 0 ...d_stack_transpose_quant_kernel_register.cu | 0 ...sed_swiglu_weighted_bwd_kernel_register.cu | 30 + .../fused_token_prune_kernel_register.cu | 0 ...d_transpose_split_quant_kernel_register.cu | 0 ...nspose_wlch_split_quant_kernel_register.cu | 0 .../kernels/metax_kernel/metax_context.cc | 35 - .../kernels/metax_kernel/metax_context.h | 2 - 20 files changed, 597 insertions(+), 782 deletions(-) mode change 100755 => 100644 backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_adam_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_bias_dropout_residual_layer_norm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_embedding_eltwise_layernorm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_layernorm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_seqpool_cvm_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_seqpool_cvm_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_softmax_mask_upper_triangle_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_stack_transpose_quant_kernel_register.cu (100%) create mode 100644 backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_token_prune_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_transpose_split_quant_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{cuda_kernels => fusion}/fused_transpose_wlch_split_quant_kernel_register.cu (100%) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index f282a9fbf7c..7b8c52f1f31 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -70,7 +70,6 @@ include(eigen) include(xxhash) include(zlib) include(protobuf) -include(generate_pb) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) @@ -614,12 +613,9 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/backends/context_pool.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/binomial_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -642,29 +638,11 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gather_tree_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_reindex_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_act_dequant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/fp8_fp8_half_gemm.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/variable_length_memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_real_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_grad_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps @@ -697,7 +675,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/int_bincount_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu) file( @@ -707,6 +684,8 @@ file( passes/*.cc kernels/*.cc kernels/*.cu + kernels/fusion/*.cc + kernels/fusion/*.cu kernels/gpudnn/*.cc kernels/gpudnn/*.cu kernels/cuda_kernels/*.cc @@ -721,13 +700,7 @@ set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA) set(CMAKE_CUCC_COMPILER "cucc") set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/") -set_source_files_properties( - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu - PROPERTIES LANGUAGE CUDA) -add_library( - 
${TARGET_NAME} SHARED - ${CUSTOM_DEVICE_SRCS} - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu) +add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS}) target_include_directories( ${TARGET_NAME} @@ -753,9 +726,6 @@ target_link_libraries( ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh index e9860ccb7d0..eba45a9ced2 100644 --- a/backends/metax_gpu/compile.sh +++ b/backends/metax_gpu/compile.sh @@ -30,7 +30,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j10 diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h old mode 100755 new mode 100644 index 419387cc9c4..ae4baa52613 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h +++ b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h @@ -34,70 +34,6 @@ PHI_DECLARE_bool(gemm_use_half_precision_compute_type); namespace phi { namespace funcs { - -inline static cublasHandle_t blas_handle_ = nullptr; -inline static cublasHandle_t blas_tensor_core_handle_ = nullptr; -inline static cublasHandle_t blas_tf32_tensor_core_handle_ = nullptr; - -inline std::once_flag flag_sparse_; -inline std::once_flag flag_blas_; -inline std::once_flag flag_blaslt_; -inline std::once_flag flag_dnn_; -inline std::once_flag flag_solver_; -inline std::once_flag flag_cublas_; -inline std::once_flag flag_tensorcore_cublas_; -inline std::once_flag flag_eigen_device_; - -inline std::mutex blas_mtx_; -inline std::mutex blas_tensor_core_mtx_; -inline std::mutex blas_tf32_mtx_; -inline std::mutex sparse_mtx_; -inline std::mutex stream_call_back_mtx_; - -inline void InitBlasHandle(cublasHandle_t *blas_handle, gpuStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(*blas_handle, stream)); -} - -inline void CublasCall(const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - std::lock_guard guard(blas_mtx_); - callback(blas_handle_); -} - -inline bool MetaxTensorCoreAvailable() { - return blas_tensor_core_handle_ != nullptr; -} - -inline void TensorCoreCublasCallIfAvailable( - const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_tensorcore_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - if (blas_tensor_core_handle_ != nullptr) { - std::lock_guard guard(blas_tensor_core_mtx_); - callback(blas_tensor_core_handle_); - } else { - std::lock_guard guard(blas_mtx_); - 
callback(blas_handle_); - } -} - template struct CUBlas; @@ -174,28 +110,26 @@ struct CUBlas { // here. #if CUDA_VERSION >= 8000 VLOG(5) << "use_tensor_op_math: " - << (MetaxTensorCoreAvailable() ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); - }, - dev_ctx->stream()); + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasSgemmEx is not supported on cuda <= 7.5")); @@ -376,7 +310,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -386,31 +320,29 @@ struct CUBlas { thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A_ptr.data().get(), - Atype, - lda, - B_ptr.data().get(), - Btype, - ldb, - beta, - C_ptr.data().get(), - Ctype, - ldc, - batchCount, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A_ptr.data().get(), + Atype, + lda, + B_ptr.data().get(), + Btype, + ldb, + beta, + C_ptr.data().get(), + Ctype, + ldc, + batchCount, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmBatchedEx is not supported on cuda <= 7.5")); @@ -486,7 +418,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -494,29 +426,27 @@ struct CUBlas { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -696,7 +626,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -704,29 +634,27 @@ struct CUBlas> { << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1024,7 +952,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1032,29 +960,27 @@ struct CUBlas> { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1186,24 +1112,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N); + }); } #if CUDA_VERSION >= 8000 @@ -1271,24 +1195,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - h_B, - ldb, - h_A, - lda, - &h_beta, - h_C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + h_B, + ldb, + h_A, + lda, + &h_beta, + h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -1352,24 +1274,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - static_cast(ldb), - A, - static_cast(lda), - &t_beta, - C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &t_alpha, + B, + static_cast(ldb), + A, + static_cast(lda), + &t_beta, + C, + static_cast(N)); + }); } #if CUDA_VERSION >= 8000 @@ -1447,24 +1367,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, CUBLAS_COMPUTE_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &h_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &h_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1503,7 +1421,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = 
dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1519,30 +1437,27 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + N, + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -1621,24 +1536,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1713,24 +1626,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1769,7 +1680,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1784,30 +1695,28 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - A, - CUDA_R_16BF, - static_cast(lda), - &h_beta, - C, - CUDA_R_16BF, - static_cast(N), - CUDA_R_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + A, + CUDA_R_16BF, + static_cast(lda), + &h_beta, + C, + CUDA_R_16BF, + 
static_cast(N), + CUDA_R_32F, + algo)); + }); } #else // raise error @@ -1860,24 +1769,22 @@ void Blas::GEMM(bool transA, } else { #endif // CUDA_VERSION >= 8000 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); #if CUDA_VERSION >= 8000 } @@ -1904,24 +1811,22 @@ inline void Blas::GEMM(bool transA, cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); } template <> @@ -1957,36 +1862,33 @@ inline void Blas::GEMM(bool transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - ldc, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -1998,27 +1900,23 @@ inline void Blas::GEMM(bool transA, template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> template void Blas::SCAL(int n, const T alpha, T *x) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } template <> template void Blas::VCOPY(int n, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } template <> @@ -2033,12 +1931,9 @@ void Blas::GEMV(bool trans_a, T *C) const { cublasOperation_t cuTransA = !trans_a ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMV( - handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -2112,7 +2007,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2153,60 +2048,56 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + a, + B, + fp, + ldb, + strideB, + A, + fp, + lda, + strideA, + b, + C, + fp, + ldc, + strideC, + batchCount, + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &beta, - C, - ldc, - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &beta, + C, + ldc, + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2242,7 +2133,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2284,61 +2175,57 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - a, - B, - fp, - static_cast(ldb), - strideB, - A, - fp, - static_cast(lda), - strideA, - b, - C, - fp, - static_cast(ldc), - strideC, - static_cast(batchCount), - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( + handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + a, + B, + fp, + static_cast(ldb), 
+ strideB, + A, + fp, + static_cast(lda), + strideA, + b, + C, + fp, + static_cast(ldc), + strideC, + static_cast(batchCount), + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 T h_alpha = static_cast(alpha); T h_beta = static_cast(beta); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &h_beta, - C, - static_cast(ldc), - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &h_beta, + C, + static_cast(ldc), + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2377,7 +2264,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2392,34 +2279,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2460,7 +2345,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2475,34 +2360,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + 
PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2547,7 +2430,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // (std::is_same::value)) || // std::is_same::value) { // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx_.tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } @@ -2579,7 +2462,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // #endif // } -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2605,12 +2488,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // compute_type, // algo)); -// }, -// dev_ctx_.stream()); +// }); // } else { // #endif // CUDA_VERSION >= 9010 -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // CUBlas::GEMM_STRIDED_BATCH(handle, // cuTransB, @@ -2667,7 +2549,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // cublasOperation_t cuTransB = // (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; // const int64_t strideC = M * N; -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasDgemmStridedBatched(handle, @@ -2723,14 +2605,14 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // float h_beta = static_cast(beta); // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx->tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } // VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : // "False"); -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2756,8 +2638,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // CUBLAS_COMPUTE_32F, // algo)); -// }, -// dev_ctx_.stream()); +// }); // #else // // raise error // PADDLE_THROW(phi::errors::Unimplemented( @@ -2812,25 +2693,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2859,25 +2738,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2970,7 +2847,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float f_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2979,31 +2856,29 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B_ptr.data().get(), - CUDA_R_16BF, - ldb, - A_ptr.data().get(), - CUDA_R_16BF, - lda, - &f_beta, - C_ptr.data().get(), - CUDA_R_16BF, - ldc, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B_ptr.data().get(), + CUDA_R_16BF, + ldb, + A_ptr.data().get(), + CUDA_R_16BF, + lda, + &f_beta, + C_ptr.data().get(), + CUDA_R_16BF, + ldc, + batchCount, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -3038,33 +2913,19 @@ void Blas::TRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM( + handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); + }); } template <> template void Blas::BatchedGETRF( int n, T **a, int *ipiv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); } template <> @@ -3084,23 +2945,18 @@ void Blas::BatchedGETRI(int n, "overlap memory space of input matrix (address: %p).", a_inv, a)); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRI_BATCH( - handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); } template <> template void Blas::BatchedMatInv( int n, const T **a, T **a_inv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); } template <> @@ -3118,12 +2974,10 @@ void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, // use CUBLAS_OP_C (conjugate transpose) for complex cublasOperation_t cuTrans = (trans == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH( + handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); + }); } template <> @@ -3152,23 +3006,21 @@ void Blas::BatchedTRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, + cuSide, + cuUplo, + cuTransA, + cuDiag, + N, + M, + &alpha, + A, + lda, + B, + ldb, + batch_size); + }); } } // namespace funcs diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu 
diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu new file mode 100644 index 00000000000..08876233bfb --- /dev/null +++ b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_swiglu_weighted_bwd, + metax_gpu, + ALL_LAYOUT, + phi::FusedSwigluWeightedBwdKernel, + float, + double, + int, + int64_t, + phi::bfloat16) { + kernel->OutputAt(0).SetDataType(phi::DataType::BFLOAT16); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::BFLOAT16); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 62aaa5fb2de..a388387de45 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,25 +15,6 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { -const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } - void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -87,20 +68,4 @@ static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { phi::dynload::hipblasLtCreate(blaslt_handle); #endif } - -blasLtHandle_t GetBlasLtHandle() { - std::call_once(flag_blaslt_, [&]() { - if (!blaslt_handle_) { - if (!blaslt_handle_creator_) - InitBlasLtHandle(&blaslt_handle_); - else - blaslt_handle_ = blaslt_handle_creator_(); - } - }); - PADDLE_ENFORCE_NOT_NULL( - blaslt_handle_, - common::errors::InvalidArgument( - "The GPU blasLt handle is nullptr. 
It must not be null.")); - return blaslt_handle_; -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index a6610c1dab2..2339e18a4a6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -128,8 +128,6 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } -bool AllowTF32Cublas(); -bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { From 1af5148d20ce28e202fb0ac672f266c807d98b17 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Fri, 19 Sep 2025 18:31:14 +0800 Subject: [PATCH 36/95] [Metax] add log analysis script (#46) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script --- .../metax_gpu/tests/scripts/classify.json | 22 ++ .../metax_gpu/tests/scripts/log_analysis.py | 216 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 backends/metax_gpu/tests/scripts/classify.json create mode 100644 backends/metax_gpu/tests/scripts/log_analysis.py diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json new file mode 100644 index 00000000000..b97255adc3d --- /dev/null +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -0,0 +1,22 @@ +{ + "OK":{ + "skipped":{ + "rule":["skipped="] + } + }, + + "FAILED":{ + "precision":{ + "rule":["Mismatched elements"] + }, + "api":{ + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + }, + "missing":{ + "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + }, + "file_not_found":{ + "rule":["FileNotFoundError:"] + } + } +} diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py new file mode 100644 index 00000000000..c0716f5b6f5 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -0,0 +1,216 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import json
+import os
+import fnmatch
+import shutil
+from enum import Enum
+
+
+class TestResult(Enum):
+    OK = "OK"
+    FAILURE = "FAILED"
+
+
+class LogAnalyzer:
+    def __init__(
+        self,
+        classify_file: str,
+        search_path: str,
+        pattern: str = None,
+        encoding: str = "utf-8",
+    ):
+        self.__patten = pattern
+        self.__search_path = search_path
+        self.__encoding = encoding
+        self.__statistical_data = {}
+
+        self.__classify_data = self.__read_json_file(classify_file)
+        for key, value in self.__classify_data.items():
+            self.__statistical_data[key] = {}
+            for sub_key in list(value.keys()):
+                self.__statistical_data[key][sub_key] = []
+
+        self.__statistical_data[TestResult.OK.value]["noskip"] = []
+        self.__statistical_data[TestResult.FAILURE.value]["other"] = []
+
+    def __read_json_file(self, path: str) -> dict:
+        with open(path, "r", encoding=self.__encoding) as f:
+            data = json.load(f)
+            f.close()
+        return data
+
+    def __check_path(self, path: str) -> None:
+        """
+        Handle the given path:
+        - Directory path: create it if it does not exist, otherwise clear its contents
+        - File path: create it if it does not exist, otherwise clear its contents
+        """
+        try:
+            # Check whether the path exists
+            if os.path.exists(path):
+                # Path exists: decide whether it is a file or a directory
+                if os.path.isfile(path):
+                    # Handle a file: clear its contents
+                    with open(path, "w", encoding="utf-8") as f:
+                        f.write("")  # write empty content to clear the file
+                    # print(f"File already exists, contents cleared: {path}")
+
+                elif os.path.isdir(path):
+                    # Handle a directory: clear everything inside it
+                    for item in os.listdir(path):
+                        item_path = os.path.join(path, item)
+                        if os.path.isfile(item_path) or os.path.islink(item_path):
+                            os.remove(item_path)  # remove file or symlink
+                        elif os.path.isdir(item_path):
+                            shutil.rmtree(item_path)  # recursively remove subdirectory
+                    # print(f"Directory already exists, contents cleared: {path}")
+            else:
+                # Path does not exist: infer the target type (from whether the last component has an extension)
+                # Get the last component of the path
+                last_part = os.path.basename(path)
+
+                # Treat it as a file path if it contains an extension
+                if "." in last_part and not last_part.endswith("."):
+                    # Create the file (including parent directories)
+                    parent_dir = os.path.dirname(path)
+                    if parent_dir and not os.path.exists(parent_dir):
+                        os.makedirs(parent_dir, exist_ok=True)
+                    with open(path, "w", encoding="utf-8") as f:
+                        pass  # create an empty file
+                    # print(f"File did not exist, created: {path}")
+
+                else:
+                    # Create the directory (multi-level paths supported)
+                    os.makedirs(path, exist_ok=True)
+                    # print(f"Directory did not exist, created: {path}")
+
+        except PermissionError:
+            print(f"Permission error: cannot operate on path {path}")
+        except Exception as e:
+            print(f"Error while handling path: {str(e)}")
+
+    def save_result(self, dir_path: str = "./") -> None:
+        """
+        Check whether the output directory exists:
+        - Create it if it does not exist
+        - Otherwise clear everything inside it (the directory itself is kept)
+        """
+
+        for key, value in self.__statistical_data.items():
+            sub_dir = os.path.join(dir_path, key)
+            self.__check_path(sub_dir)
+
+            for sub_key, sub_value in value.items():
+                # print(f"{sub_key}: {len(value[sub_key])} - ({sub_value})")
+                try:
+                    with open(
+                        os.path.join(sub_dir, sub_key) + ".txt", "w", encoding="utf-8"
+                    ) as f:
+                        for op_name in sub_value:
+                            if not op_name.endswith("\n"):
+                                op_name += "\n"
+                            f.write(op_name)
+                    # print(f"Content successfully {'appended' if append else 'written'} to {file_path}")
+                except Exception as e:
+                    print(f"Failed to write file: {e}")
+
+    def show_result(self) -> None:
+        test_counts = 0
+        for key, value in self.__statistical_data.items():
+            print(f"\n---------- {key} ----------")
+            for sub_key, sub_value in value.items():
+                test_counts = test_counts + len(value[sub_key])
+                print(f"{sub_key}: {len(value[sub_key])}\n\t{sub_value}\n")
+        print(
+            f"\n******************* Total log num: {test_counts} *******************\n\n"
+        )
+
+    def run(self):
+        """
+        Read the files under the given directory that match the naming pattern and walk through every line.
+
+        Args:
+            search_path: root directory to search
+            pattern: file name pattern (wildcards supported, e.g. '*.txt', 'file_*.log')
+        """
+        for dirpath, dirnames, filenames in os.walk(self.__search_path):
+            for filename in fnmatch.filter(filenames, self.__patten):
+                file_path = os.path.join(dirpath,
filename) + # print(f"\n===== 正在处理文件: {file_path} =====") + + cur_res_type = TestResult.FAILURE + cur_sub_type = "other" + pre_line = None + finish_early = False + + try: + with open(file_path, "r", encoding=self.__encoding) as f: + for line in f: + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for keyword in sub_type_params["rule"]: + if keyword in line: + cur_sub_type = sub_type + if sub_type == "missing": + finish_early = True + break + + if finish_early: + break + + pre_line = line + if finish_early: + break + + if "OK" in pre_line: + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + + op_name = filename.split(".") + if cur_sub_type is None: + self.__statistical_data[cur_res_type.value][ + "noskip" + ].append(op_name[0]) + else: + self.__statistical_data[cur_res_type.value][ + cur_sub_type + ].append(op_name[0]) + # print(f"Result: {cur_res_type.value}, type: {cur_sub_type}") + f.close() + except UnicodeDecodeError: + print(f"警告: 文件 {file_path} 编码不是 utf-8,跳过处理") + except Exception as e: + print(f"处理文件 {file_path} 时出错: {str(e)}") + + +if __name__ == "__main__": + + analyzer = LogAnalyzer( + classify_file="./classify.json", + search_path="./NPU_logs/20250918_065326", + pattern="test_*.log", + ) + + analyzer.run() + analyzer.show_result() + analyzer.save_result("./output") From 518bee8382cdb7879f38e8b81e719aa8853b825e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 19 Sep 2025 19:07:47 +0800 Subject: [PATCH 37/95] add_generate_pb (#47) * add_generate_pb --------- --- backends/metax_gpu/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 7b8c52f1f31..78b4c9c566b 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -70,6 +70,7 @@ include(eigen) include(xxhash) include(zlib) include(protobuf) +include(generate_pb) set(PROTO_FILE "${PADDLE_SOURCE_DIR}/paddle/phi/core/external_error.proto") get_filename_component(PROTO_WE "${PROTO_FILE}" NAME_WE) From bc02549e7450cffb6b6925ef199b6f6fcbd63259 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 22 Sep 2025 16:44:28 +0800 Subject: [PATCH 38/95] modify blas (#51) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas --- backends/metax_gpu/CMakeLists.txt | 1 + .../metax_gpu/kernels/metax_kernel/metax_context.cc | 12 ------------ .../metax_gpu/kernels/metax_kernel/metax_context.h | 4 +--- backends/metax_gpu/patch/paddle.patch | 1 - 4 files changed, 2 insertions(+), 16 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 78b4c9c566b..b98f2bcc919 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -733,6 +733,7 @@ target_compile_definitions( ${TARGET_NAME} PUBLIC PADDLE_WITH_CUDA=1 PADDLE_WITH_CUSTOM_DEVICE=1 + mcblasContext=cublasContext GPUContext=CustomContext KPSContext=CustomContext STREAM_TYPE=cudaStream_t diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 
a388387de45..6d86c81041f 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -56,16 +56,4 @@ void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { allocation_.reset(); allocation_ = allocator_->Allocate(required_workspace_bytes); } - -static std::function blaslt_handle_creator_{nullptr}; -static blasLtHandle_t blaslt_handle_{nullptr}; -static std::once_flag flag_blaslt_; - -static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - mcblasLtCreate(blaslt_handle); -#elif defined(PADDLE_WITH_HIP) - phi::dynload::hipblasLtCreate(blaslt_handle); -#endif -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 2339e18a4a6..376981f27a4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -27,9 +27,7 @@ #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" -using blasLtHandle_t = struct mcblasLtContext*; - -blasLtHandle_t GetBlasLtHandle(); +cublasLtHandle_t GetBlasLtHandle(); namespace phi { class DnnWorkspaceHandle { diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index b7bdb953077..beefb730bf7 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -488,7 +488,6 @@ index 4eae698648..5c047723ea 100644 #endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; } - diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h From 1977ca87be51518f59506d37c08790938e4c1345 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 22 Sep 2025 17:31:21 +0800 Subject: [PATCH 39/95] [metax] modify tf32 (#52) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context --- .../kernels/metax_kernel/metax_context.cc | 18 ++++++++++++++++++ .../kernels/metax_kernel/metax_context.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 6d86c81041f..efddba5f00b 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,6 +15,24 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return true; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 376981f27a4..2d761439089 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -30,6 +30,8 @@ cublasLtHandle_t GetBlasLtHandle(); namespace phi { +bool AllowTF32Cublas(); +bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) From 1ae2618ac81e21e41b05797e08f1330eb504c4d5 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Mon, 22 Sep 2025 17:46:50 +0800 Subject: [PATCH 40/95] [Metax] update metax backend CI test (#53) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test --- backends/metax_gpu/tests/CMakeLists.txt | 192 +++++++++++------------- backends/metax_gpu/tests/default.txt | 67 +++++++++ backends/metax_gpu/tests/run_test.sh | 56 ++++++- 3 files changed, 202 insertions(+), 113 deletions(-) create mode 100644 backends/metax_gpu/tests/default.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 795a3c5b8ac..ded54233f24 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -11,117 +11,95 @@ set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") -list( - APPEND - PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - 
${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) - -list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +if(NOT TEST_LIST_FILE) + message( + STATUS + " is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used." 
+ ) + file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) + +else() + if(NOT EXISTS ${TEST_LIST_FILE}) + message(FATAL_ERROR " is not exist, please check it again.") + endif() + + file(STRINGS ${TEST_LIST_FILE} TEST_PROGRAMS) + + if(NOT TEST_PROGRAMS) + message(FATAL_ERROR " is empty.") + endif() + + set(PYTHON_TEST_SCRIPTS "") +endif() + +foreach(test_name ${TEST_PROGRAMS}) + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) + message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") + else() + list(APPEND PYTHON_TEST_SCRIPTS ${CURRENT_TEST_PROGRAM}) + endif() +endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) + +if(NOT TEST_LIST_FILE) + list( + REMOVE_ITEM + PYTHON_TEST_SCRIPTS + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口的适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +endif() + +if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) + file(MAKE_DIRECTORY ${LOG_OUTPUT_DIR}) + message(WARNING "${LOG_OUTPUT_DIR} is not exist, create it now.") +endif() + foreach(test_script ${PYTHON_TEST_SCRIPTS}) get_filename_component(test_name ${test_script} NAME_WE) - add_test( - NAME "python_${test_name}" - COMMAND ${Python_EXECUTABLE} ${test_script} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + if(LOG_OUTPUT_DIR) + set(test_log_file "${LOG_OUTPUT_DIR}/${test_name}.log") + + add_test( + NAME "python_${test_name}" + COMMAND sh -c + "${Python_EXECUTABLE} ${test_script} > ${test_log_file} 2>&1" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + + else() + add_test( + NAME "python_${test_name}" + COMMAND ${Python_EXECUTABLE} ${test_script} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) endforeach() diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt new file mode 100644 index 00000000000..8e2c3bcdd7e --- /dev/null +++ b/backends/metax_gpu/tests/default.txt @@ -0,0 +1,67 @@ +test_accuracy_op +test_tril_triu_op +test_where_op +test_split_op +test_fill_constant_op +test_empty_op +test_sign_op +test_cast_op +test_index_add_op +test_unbind_op +test_put_along_axis_op +test_layer_norm_op +test_maximum_op +test_accuracy_op +test_strided_slice_op +test_sum_op +test_set_value_op +test_flatten_contiguous_range_op +test_top_k_op +test_subtract_op +test_softmax_op +test_cumsum_op 
+test_greater_equal_op
+test_elementwise_div_op
+test_top_k_v2_op
+test_stack_op
+test_one_hot_v2_op
+test_fill_any_op
+test_gather_op
+test_reshape_op
+test_index_put_op
+test_bitwise_op
+test_max_op
+test_pad_op
+test_elementwise_pow_op
+test_uniform_random_op
+test_scatter_op
+test_cast_op
+test_zeros_like_op
+test_compare_op
+test_shape_op
+test_tril_triu_op
+test_slice_op
+test_elementwise_add_op
+test_index_put_op
+test_bincount_op
+test_assign_op
+test_logical_op
+test_squared_l2_norm_op
+test_mean_op
+test_fused_bias_act_op
+test_expand_v2_op
+test_adamw_op
+test_gather_nd_op
+test_concat_op
+test_scatter_nd_op
+test_elementwise_floordiv_op
+test_elementwise_mul_op
+test_transpose_op
+test_einsum_op
+test_randint_op
+test_c_embedding_op
+test_numel_op
+test_scale_op
+test_softmax_with_cross_entropy_op
+test_full_op
+test_scatter_op
diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh
index 7d1e8e072a9..b9e8ec5b5cc 100755
--- a/backends/metax_gpu/tests/run_test.sh
+++ b/backends/metax_gpu/tests/run_test.sh
@@ -2,13 +2,13 @@
 #!/bin/bash
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,10 +29,54 @@ export
 rm -r build
 mkdir -p build && cd build
-cmake ..
+TEST_LOG_LEVEL=0
+TEST_LIST_FILE=""
+TEST_LOG_OUTPUT_DIR=""
+TEST_PARALLEL_NUM=10
-cmake --build .
+while getopts "i:o:v:j:h" opt; do
+  case "$opt" in
+    i)
+      TEST_LIST_FILE="$OPTARG"
+      ;;
+    o)
+      TEST_LOG_OUTPUT_DIR="$OPTARG"
+      echo "Set log output dir [ $TEST_LOG_OUTPUT_DIR ]"
+      ;;
+    v)
+      TEST_LOG_LEVEL=$OPTARG
+      ;;
+    j)
+      TEST_PARALLEL_NUM="$OPTARG"
+      ;;
+    h)
+      echo "Usage: $0 -i <test list file> -o <log output dir> ..."
+      echo "Options:"
+      echo "  -i  file listing the test programs to run"
+      echo "  -o  directory for the log output"
+      echo "  -v  GLOG_v log level"
+      echo "  -j  number of parallel ctest jobs"
+      echo "  -h  show this help"
+      exit 0
+      ;;
+    \?)
+      echo "error: unknown option '-$OPTARG'."
+      exit 1
+      ;;
+    :)
+      echo "error: option '-$OPTARG' requires an argument."
+      exit 1
+      ;;
+  esac
+done
+
+
+export GLOG_v=$TEST_LOG_LEVEL
 
-ctest -j10 --output-on-failure
+cmake .. -DTEST_LIST_FILE=$TEST_LIST_FILE -DLOG_OUTPUT_DIR=$TEST_LOG_OUTPUT_DIR
+
+cmake --build .
+ +ctest -j$TEST_PARALLEL_NUM --output-on-failure From 76d5eb0245904cc209e52dd9fa92dea990db1ad7 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 09:43:37 +0800 Subject: [PATCH 41/95] [Metax] fix log_analysis.py bug (#54) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug --- .../metax_gpu/tests/scripts/log_analysis.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py index c0716f5b6f5..963d50751f7 100644 --- a/backends/metax_gpu/tests/scripts/log_analysis.py +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -153,7 +153,6 @@ def run(self): cur_res_type = TestResult.FAILURE cur_sub_type = "other" - pre_line = None finish_early = False try: @@ -172,19 +171,19 @@ def run(self): if finish_early: break - pre_line = line if finish_early: break - if "OK" in pre_line: - cur_res_type = TestResult.OK - cur_sub_type = None - for sub_type, sub_type_params in self.__classify_data[ - cur_res_type.value - ].items(): - for rule in sub_type_params["rule"]: - if rule in line: - cur_sub_type = sub_type + if len(line) >= 2 and line[:2] == "OK": + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + break op_name = filename.split(".") if cur_sub_type is None: From 9c17b6e0867119ea51c1c4230603f2a34137ac68 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 11:09:44 +0800 Subject: [PATCH 42/95] [Metax] update metax CI CMakeLists & scripts (#56) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug * [Metax] update metax CI CMakeLists & scripts --- .github/workflows/metax_work.yaml | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 4 ++- backends/metax_gpu/tests/run_test.sh | 2 +- .../metax_gpu/tests/scripts/classify.json | 31 +++++++++++++++++-- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 51c0c62cef6..aff530d475c 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -51,4 +51,4 @@ jobs: - name: run test run: | cd backends/metax_gpu/tests - bash run_test.sh + bash run_test.sh -j 16 diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index ded54233f24..5b7be15e4f9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -47,6 +47,8 @@ if(NOT TEST_LIST_FILE) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS + # Metax unit test + ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py @@ -101,5 +103,5 @@ foreach(test_script ${PYTHON_TEST_SCRIPTS}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() - set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 600) 
endforeach() diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index b9e8ec5b5cc..7f2277fe4fb 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -33,7 +33,7 @@ mkdir -p build && cd build TEST_LOG_LEVEL=0 TEST_LIST_FILE="" TEST_LOG_OUTPUT_DIR="" -TEST_PARALLEL_NUM=10 +TEST_PARALLEL_NUM=1 while getopts "i:o:v:j:h" opt; do case "$opt" in diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json index b97255adc3d..ca92ad4a0a4 100644 --- a/backends/metax_gpu/tests/scripts/classify.json +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -7,13 +7,38 @@ "FAILED":{ "precision":{ - "rule":["Mismatched elements"] + "rule":["Mismatched elements", + "RuntimeError: Jacobian mismatch for output 0 in y with respect to input 0 in x on Place(metax_gpu:0),", + "AssertionError: np.float64("] }, "api":{ - "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", + "ValueError: The API paddle.device.cuda.get_device_properties", + "TypeError: paddle.index_add api", + "RuntimeError: (Unavailable) Paddle is not compiled with CUDA.", + "ValueError: invalid literal for int() with base", + "AttributeError: module 'paddle.base.libpaddle' has no attribute 'cudnn_version'", + "RuntimeError: Pinning memory is not supported for Place(metax_gpu:0)", + "PreconditionNotMetError: Context place error, excepted GPUPlace, but actually Place(metax_gpu:0).", + "AttributeError: module 'paddle.base.libpaddle.eager.ops.legacy' has no attribute 'fused_gemm_epilogue'", + "ValueError: The device should not be 'gpu', since PaddlePaddle is not compiled with CUDA"] }, "missing":{ - "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + "rule":["missing metax_gpu kernel", + "missing ONEDNN kernel", + "UnimplementedError: There are no kernels which are registered", + "symbol lookup error:", + "RuntimeError: (NotFound) The kernel"] + }, + "core_dumped":{ + "rule":["Segmentation fault"] + }, + "input_dim":{ + "rule":["ValueError: (InvalidArgument) The Input(", + "Test range of input is out of bound"] + }, + "array_dim":{ + "rule":["Arrays are not equal"] }, "file_not_found":{ "rule":["FileNotFoundError:"] From 51c98a20020ba61b2bfab54abf11668a9f40e0b6 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:11:49 +0800 Subject: [PATCH 43/95] [Metax] fix MatmulKernel problem (#57) * [Metax] fix dgc & mklml compile product path problem * [Metax] update metax_gpu CMakeLists.txt * [Metax] organize documents * [Metax] add log analysis script * [Metax] update metax backend CI test * [Metax] fix log_analysis.py bug * [Metax] update metax CI CMakeLists & scripts * [Metax] fix MatmulKernel problem * [Metax] update metax CI program --- .../kernels/impl/matmul_kernel_impl.h | 19 +- backends/metax_gpu/tests/CMakeLists.txt | 2 +- backends/metax_gpu/tests/default.txt | 258 ++++++++++++ ...r_equal.py => test_greater_equal_metax.py} | 0 ...ild_src_rank_and_local_expert_id_metax.py} | 0 ...cubate_expand_modality_expert_id_metax.py} | 0 ....py => test_incubate_moe_combine_metax.py} | 0 ...e_dispatch_partial_nosoftmaxtopk_metax.py} | 0 ..._moe_gate_dispatch_w_permute_bwd_metax.py} | 0 ...bate_moe_gate_dispatch_w_permute_metax.py} | 0 ...layer_norm.py => 
test_layer_norm_metax.py} | 0 ...l_op__metax.py => test_matmul_op_metax.py} | 0 ...mpling.py => test_top_p_sampling_metax.py} | 0 .../tests/unittest/test_matmul_op__metax.py | 395 ------------------ 14 files changed, 272 insertions(+), 402 deletions(-) rename backends/metax_gpu/tests/unit_test/{test_greater_equal.py => test_greater_equal_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_build_src_rank_and_local_expert_id.py => test_incubate_build_src_rank_and_local_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_expand_modality_expert_id.py => test_incubate_expand_modality_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_combine.py => test_incubate_moe_combine_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py => test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute_bwd.py => test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute.py => test_incubate_moe_gate_dispatch_w_permute_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_layer_norm.py => test_layer_norm_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_matmul_op__metax.py => test_matmul_op_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_top_p_sampling.py => test_top_p_sampling_metax.py} (100%) delete mode 100644 backends/metax_gpu/tests/unittest/test_matmul_op__metax.py diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h index bf228c81291..5221bd93ba9 100755 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h @@ -40,6 +40,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 #include "paddle/phi/kernels/autotune/auto_tune_base.h" #endif +#include "paddle/phi/kernels/full_kernel.h" // clang-format on namespace phi { @@ -1485,16 +1486,22 @@ void MatmulKernel(const Context& ctx, bool transpose_x, bool transpose_y, DenseTensor* out) { - PADDLE_ENFORCE_NE( + if (x.numel() == 0 || y.numel() == 0) { + // input shape [1, 1, 5, 0], [1, 1, 0, 5], result shape is [1, 1, 5, 5] + phi::Full( + ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); + return; + } + PADDLE_ENFORCE_GE( common::product(x.dims()), 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( + common::errors::InvalidArgument( + "The dims of Input(X) should be greater than or equal to 0.")); + PADDLE_ENFORCE_GE( common::product(y.dims()), 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. 
")); + common::errors::InvalidArgument( + "The dims of Input(Y) should be greater than or equal to 0.")); const std::vector x_dims = common::vectorize(x.dims()); const std::vector y_dims = common::vectorize(y.dims()); MatmulJudgeDtypeKernel( diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 5b7be15e4f9..e8b11d347d9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -48,7 +48,7 @@ if(NOT TEST_LIST_FILE) REMOVE_ITEM PYTHON_TEST_SCRIPTS # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py + ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 8e2c3bcdd7e..9f073d7e92f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -65,3 +65,261 @@ test_scale_op test_softmax_with_cross_entropy_op test_full_op test_scatter_op +test_assign_pos_op +test_index_select_compatible +test_dequantize_abs_max_op +test_fill_any_op +test_fractional_max_pool3d_api +test_nll_loss +test_is_empty_op +test_norm_nn_grad +test_index_fill +test_floor +test_slice_scatter +test_nn_matmul_v2_grad +test_matmul_op_with_head +test_broadcast_shape +test_fill_constant_op +test_decayed_adagrad_op +test_count_nonzero_api +test_tensor_fill_ +test_minimum_op +test_sigmoid_focal_loss +test_dynamic_rnn_stop_gradient +test_ops_roi_align +test_split_op +test_sum_decorator +test_share_data_op +test_assert_op +test_masked_select_op +test_tensor_fill_diagonal_tensor_ +test_unfold_op +test_scatter_add_op +test_flatten_contiguous_range_op +test_empty_like_op +test_logsumexp +test_multiply +test_ceil_op +test_nearest_interp_v2_op +test_incubate_expand_modality_expert_id +test_bmm_op +test_prelu_op +test_batch_fc_op +test_masked_fill +test_overlap_add_op +test_update_loss_scaling_op +test_floor_divide_op +test_increment +test_complex_abs +test_gather_compatible +test_functional_conv2d +test_group_norm_op_v2 +test_conv2d_transpose_op_depthwise_conv +test_diagonal_op +test_maximum_op +test_erfinv_op +test_interp_recompute_scale_factor +test_embedding_scale_grad_by_freq +test_diagonal_scatter +test_higher_dim_scatter +test_infer_shape +test_flip +test_fused_bias_dropout_residual_layer_norm_op +test_greater_equal_op +test_add_op +test_cartesian_prod +test_uniform_random_inplace_op +test_feed_fetch_method +test_pow_op +test_conv3d_transpose_op +test_add_position_encoding_op +test_imperative_data_loader_base +test_rnn_cell_api +test_linspace +test_adaptive_log_softmax_with_loss +test_cross_entropy2_op +test_complex_reshape +test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk +test_gaussian_nll_loss +test_log_normal +test_unstack_op +test_expand_as_v2_op +test_dequantize_log_op +test_complex_sum_layer +test_slice_var +test_scale_op +test_hinge_embedding_loss +test_set_value_op +test_merged_adam_op +test_index_sample_op +test_cuda_empty_cache +test_add_n_op +test_randint_like +test_unique_consecutive_op +test_fill_diagonal_tensor_op +test_log_loss_op +test_linalg_cholesky_inverse +test_numel_op +test_tril_triu_op +test_adaptive_max_pool2d +test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad +test_complex_cast +test_poisson_nll_loss +test_empty_op +test_functional_conv1d_transpose +test_clip_by_norm_op +test_box_clip_op +test_clip_op +test_grad_clip_minimize +test_less_than_op +test_adamw_op +test_data_feeder 
+test_top_p_sampling +test_subtract_op +test_batch_norm_op_v2 +test_cosine_embedding_loss +test_imperative_data_parallel +test_sigmoid +test_adaptive_max_pool3d +test_roll_op +test_index_put_op +test_assign_op +test_amp_check_finite_and_scale_op +test_strided_slice_op +test_label_smooth_functional +test_c_softmax_with_cross_entropy_op +test_sync_batch_norm_op_convert +test_tensor_fill_diagonal_tensor +test_bfloat16_embedding +test_gelu_op +test_full_ +test_concat_op +test_imperative_data_loader_process +test_tensor_fill_diagonal_ +test_clip_grad_norm_ +test_eager_deletion_padding_rnn +test_pool2d_api +test_clip_grad_value_ +test_isfinite_v2_op +test_nn_sigmoid_op +test_adaptive_avg_pool2d +test_size +test_sigmoid_cross_entropy_with_logits_op +test_scatter_reduce_op +test_rsqrt +test_conv2d_transpose_layer +test_scatter_compatible +test_scatter_nd_op +test_add_op_fluid +test_unique +test_compat_split_static +test_stack_op +test_tile_op +test_adam_optimizer_fp32_fp64 +test_batch_norm_op +test_gather_nd_op +test_pow +test_executor_check_fetch_list +test_inplace_softmax_with_cross_entropy +test_cos +test_imperative_parallel_coalesce_split +test_grid_sample_function +test_rnn_decode_api +test_triu_indices_op +test_binary_cross_entropy_with_logits_op +test_mean_op_v1 +test_round_op +test_assign_pos_op_dygraph +test_nn_functional_embedding_static +test_norm_op +test_unbind_op +test_bilinear_interp_v2_op +test_tensor_data_ptr +test_norm_all +test_conv1d_transpose_layer +test_arange +test_compat_unfold +test_fetch_var +test_index_select_op +test_sign_op +test_functional_conv3d_transpose +test_uniform_random_bf16_op +test_gather_tree_op +test_histogram_bin_edges_op +test_fractional_max_pool2d_api +test_fill_any_like_op +test_alpha_dropout +test_conv3d_layer +test_compat_pad +test_box_coder_op +test_full_op +test_repeat_interleave_op +test_reshape_op +test_embedding_renorm +test_log_softmax +test_pad3d_op +test_diag_v2 +test_complex_transpose +test_prior_box_op +test_square_error_cost +test_fused_rotary_position_embedding +test_gru_rnn_op +test_restrict_nonzero +test_dygraph_weight_norm +test_conv_transpose_nn_grad +test_incubate_build_src_rank_and_local_expert_id +test_elementwise_nn_grad +test_fused_bias_dropout_residual_layer_norm_op_api +test_simple_rnn_op +test_data_generator +test_compat_split +test_scatter_add_inplace_op +test_c_softmax_with_multi_label_cross_entropy_op +test_conv3d_transpose_layer +test_less_equal_op +test_gumbel_softmax_op +test_assign_value_op +test_cast_op +test_fused_bias_act_op +test_conv3d_transpose_part2_op +test_log +test_data +test_incubate_moe_combine +test_masked_scatter +test_silu_op +test_select_scatter_op +test_adagrad_op_v2 +test_functional_conv3d +test_bce_with_logits_loss +test_argsort_op +test_layer_norm_op_v2 +test_adaptive_max_pool1d +test_shard_index_op +test_cuda_max_memory_allocated +test_roi_align_op +test_sin +test_take +test_take_along_dim +test_complex_matmul +test_reduce_as_op +test_log_normal_inplace +test_repeat +test_fetch_lod_tensor_array +test_partial_concat_op +test_accuracy_op +test_l1_norm_op +test_bce_loss +test_fused_conv2d_add_act_op +test_tril_indices_op +test_cross_entropy_op +test_blha_get_max_len_op +test_softmax_mask_fuse_op +test_diag_embed +test_one_hot_v2_op +test_selu_op +test_huber_loss_op +test_einsum_op +test_dygraph_spectral_norm +test_block_diag +test_index_elementwise +test_matmul_out diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py 
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_greater_equal.py
rename to backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py
rename to backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py
rename to backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py
rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py
rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py
rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_layer_norm.py
rename to backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py
rename to backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py
diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py
similarity index 100%
rename from backends/metax_gpu/tests/unit_test/test_top_p_sampling.py
rename to backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py
diff --git a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py b/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py
deleted file mode 100644 index 7545e16d14d..00000000000 --- a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py +++ /dev/null @@ -1,395 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -from tests.op_test import OpTest -import paddle - -paddle.enable_static() -SEED = 2022 - - -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. - if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size,)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size,)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if abs(scale - 1.0) > 1e-09: - Out = Out * scale - return Out - - -class TestBmmOp(OpTest): - """ - case 0 - """ - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (10, 2, 5) - self.y_shape = (10, 5, 8) - - def init_kernel_type(self): - self.dtype = "float32" - - def setUp(self): - self.set_metax_gpu() - self.init_kernel_type() - self.config() - self.op_type = "bmm" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y - result = reference_matmul(x, y) - result = result.astype(self.dtype) - self.inputs = { - "X": x, - "Y": y, - } - self.outputs = {"Out": result} - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp1(TestBmmOp): - """ - case 1 - """ - - def config(self): - self.x_shape = (40, 10, 10) - self.y_shape = (40, 10, 10) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp2(TestBmmOp): - """ - case 2 - """ - - def config(self): - self.x_shape = (4, 10, 80) - self.y_shape = (4, 80, 1) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, - ["X", "Y"], - "Out", - max_relative_error=1e-2, - ) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - -class TestMatMulOp(OpTest): - """ - basic case - """ - - def setUp(self): - self.set_metax_gpu() - self.op_type = "matmul_v2" - self.init_dtype() - self.init_alpha() - self.config() - - X = np.random.random(self.x_shape).astype(self.dtype) - Y = 
np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - X = -0.1 + 0.2 * X - Y = -0.1 + 0.2 * Y - Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) - Out = Out.astype(self.dtype) - self.inputs = {"X": X, "Y": Y} - self.attrs = { - "trans_x": self.transpose_X, - "trans_y": self.transpose_Y, - "alpha": self.alpha, - } - self.outputs = {"Out": Out} - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (100,) - self.y_shape = (100,) - self.transpose_X = False - self.transpose_Y = False - - def init_alpha(self): - self.alpha = 1.0 - - def init_dtype(self): - self.dtype = "float32" - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-7) - - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestMatMulOp1(TestMatMulOp): - """ - case x_ndim == 1, y_ndim != 1 - """ - - def config(self): - self.x_shape = (100,) - self.y_shape = (1, 3, 2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp2(TestMatMulOp): - """ - case x_ndim != 1, y_ndim == 1 - """ - - def config(self): - self.x_shape = (1, 2, 100, 1) - self.y_shape = (100,) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp3(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp4(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp5(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (100, 2) - self.y_shape = (100, 2) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp6(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 2, 25) - self.y_shape = (25, 4) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp7(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 4, 25) - self.y_shape = (4, 25) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp8(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 25, 4) - self.y_shape = (25, 4) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp9(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 10, 5) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp10(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 10, 5) - self.y_shape = (2, 10, 5) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp11(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 5, 10) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp12(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = 100 - self.y_shape = (1, 2, 2, 100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class 
TestMatMulOp13(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = (2, 1, 100) - self.y_shape = 100 - self.transpose_X = False - self.transpose_Y = False - - -# TODO(metax_gpu): alpha will be supported in next version -# --------------------test matmul alpha-------------------- -# def create_test_alpha_class(parent): -# class TestMatMulOpAlphaCase(parent): -# def init_alpha(self): -# self.alpha = 0.125 - -# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") -# TestMatMulOpAlphaCase.__name__ = cls_name -# globals()[cls_name] = TestMatMulOpAlphaCase - -# create_test_alpha_class(TestMatMulOp) -# create_test_alpha_class(TestMatMulOp1) -# create_test_alpha_class(TestMatMulOp2) -# create_test_alpha_class(TestMatMulOp3) -# create_test_alpha_class(TestMatMulOp4) -# create_test_alpha_class(TestMatMulOp5) -# create_test_alpha_class(TestMatMulOp6) -# create_test_alpha_class(TestMatMulOp9) -# create_test_alpha_class(TestMatMulOp10) -# create_test_alpha_class(TestMatMulOp11) -# create_test_alpha_class(TestMatMulOp12) -# create_test_alpha_class(TestMatMulOp13) - - -# --------------------test matmul fp16-------------------- -def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): - class TestMatMulOpFp16Case(parent): - def init_kernel_type(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output_with_place(self.place, atol=atol) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error - ) - - cls_name = "{0}_{1}".format(parent.__name__, "Fp16") - TestMatMulOpFp16Case.__name__ = cls_name - globals()[cls_name] = TestMatMulOpFp16Case - - -create_test_fp16_class(TestMatMulOp) -create_test_fp16_class(TestMatMulOp1) -create_test_fp16_class(TestMatMulOp2) -create_test_fp16_class(TestMatMulOp3) -create_test_fp16_class(TestMatMulOp4) -create_test_fp16_class(TestMatMulOp5) -create_test_fp16_class(TestMatMulOp6) -create_test_fp16_class(TestMatMulOp9) -create_test_fp16_class(TestMatMulOp10) -create_test_fp16_class(TestMatMulOp11) -create_test_fp16_class(TestMatMulOp12) -create_test_fp16_class(TestMatMulOp13) - -if __name__ == "__main__": - unittest.main() From d113018e9befab1540aa21ee5d6f8261831e245d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 19:12:06 +0800 Subject: [PATCH 44/95] [metax]fix paddle bug" (#58) * [metax]fix paddle bug --- backends/metax_gpu/CMakeLists.txt | 2 - .../grid_sample_grad_kernel_register.cu | 23 - .../grid_sample_kernel_register.cu | 19 - .../grid_sample_grad_kernel_register.cu | 839 ++++++++++++++++++ .../grid_sample_kernel_register.cu | 527 +++++++++++ .../metax_kernel/weight_only_linear_kernel.cu | 3 +- 6 files changed, 1368 insertions(+), 45 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index b98f2bcc919..bca1ce7aad4 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -310,8 +310,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/hinge_loss_grad_kernel.cu 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/hinge_loss_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gru_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_grad_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/grid_sample_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/generate_proposals_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gaussian_inplace_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gammaln_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu deleted file mode 100644 index 83c47dc86db..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_grad_kernel_register.cu +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/grid_sample_grad_kernel.h" - -PD_CUSTOM_KERNEL_REGISTER(grid_sample_grad, - metax_gpu, - ALL_LAYOUT, - phi::GridSampleGradKernel, - float, - double) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu deleted file mode 100644 index a0447405971..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/grid_sample_kernel_register.cu +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/grid_sample_kernel.h" - -PD_CUSTOM_KERNEL_REGISTER( - grid_sample, metax_gpu, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu new file mode 100644 index 00000000000..8aae95bdb22 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/grid_sample_grad_kernel_register.cu @@ -0,0 +1,839 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernels/metax_kernel/metax_context.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" +#include "paddle/phi/kernels/grid_sample_grad_kernel.h" + +namespace phi { + +template +static __forceinline__ __device__ void AtomicAdd(T* data, + IndexT h, + IndexT w, + IndexT sH, + IndexT sW, + IndexT H, + IndexT W, + T delta) { + if (InBounds(h, w, H, W)) { + phi::CudaAtomicAdd(data + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ void AtomicAdd3D(T* data, + IndexT d, + IndexT h, + IndexT w, + IndexT sD, + IndexT sH, + IndexT sW, + IndexT D, + IndexT H, + IndexT W, + T delta) { + if (InBounds3D(d, h, w, D, H, W)) { + phi::CudaAtomicAdd(data + d * sD + h * sH + w * sW, delta); + } +} + +template +static __forceinline__ __device__ T +UnnormalizeWithMask(T coord, IndexT size, bool align_corners, T* grad_in) { + if (align_corners) { + *grad_in = static_cast(size - 1) / 2; + return ((coord + 1.f) / 2) * (size - 1); + } else { + *grad_in = static_cast(size) / 2; + return ((coord + 1.f) * size - 1) / 2; + } +} + +template +static __forceinline__ __device__ T ClipIndexesWithMask(T in, + IndexT clip_limit, + T* grad_in) { + if (in <= static_cast(0)) { + *grad_in = static_cast(0); + return static_cast(0); + } else { + T max = static_cast(clip_limit - 1); + if (in >= max) { + *grad_in = static_cast(0); + return max; + } else { + *grad_in = static_cast(1); + return in; + } + } +} + +template +static __forceinline__ __device__ T +ReflectIndexesWithMask(T in, IndexT twice_low, IndexT twice_high, T* grad_in) { + if (twice_low == twice_high) { + *grad_in = static_cast(0); + return static_cast(0); + } + IndexT grad_in_mult_; + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = in - min; + if (in < static_cast(0)) { + grad_in_mult_ = -1; + in = -in; + } else { + grad_in_mult_ = 1; + } + T extra = fmod(in, span); + IndexT flips = static_cast(floor(in / span)); + if (flips % 2 == 0) { + *grad_in = static_cast(grad_in_mult_); + return extra + min; + } else { + *grad_in = static_cast(-grad_in_mult_); + return span - extra + min; + } +} + +template +static __forceinline__ __device__ T +ComputePositionsWithMask(T coord, + IndexT size, + PaddingMode padding_mode, + bool align_corners, + T* grad_in) { + T grad_clip, grad_refl; + coord = UnnormalizeWithMask(coord, size, align_corners, grad_in); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_clip; + } else if (padding_mode == PaddingMode::reflect) { + coord = align_corners ? 
ReflectIndexesWithMask( + coord, 0, 2 * (size - 1), &grad_refl) + : ReflectIndexesWithMask( + coord, -1, 2 * size - 1, &grad_refl); + coord = ClipIndexesWithMask(coord, size, &grad_clip); + *grad_in = (*grad_in) * grad_refl * grad_clip; + } + return SafeDownGradeToIntRange(coord); +} + +template +__global__ void GridSamplerCudaBackwardKernel(const IndexT nthreads, + const T* grad_output, + const T* input, + const T* grid, + IndexT n, + IndexT out_c, + IndexT out_h, + IndexT out_w, + IndexT in_h, + IndexT in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sN = out_c * in_h * in_w; + IndexT inp_sC = in_h * in_w; + IndexT inp_sH = in_w; + IndexT inp_sW = 1; + IndexT grid_sN = out_h * out_w * 2; + IndexT grid_sH = out_w * 2; + IndexT grid_sW = 2; + IndexT grid_sCoor = 1; + + IndexT gOut_sN = out_c * out_h * out_w; + IndexT gOut_sC = out_h * out_w; + IndexT gOut_sH = out_w; + IndexT gOut_sW = 1; + + CUDA_KERNEL_LOOP(index, nthreads) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT n = index / (out_h * out_w); + const IndexT grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + T gix_mult, giy_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + + if (mode == Mode::bilinear) { + IndexT ix_nw = static_cast(floor(ix)); + IndexT iy_nw = static_cast(floor(iy)); + IndexT ix_ne = ix_nw + 1; + IndexT iy_ne = iy_nw; + IndexT ix_sw = ix_nw; + IndexT iy_sw = iy_nw + 1; + IndexT ix_se = ix_nw + 1; + IndexT iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + T gix = static_cast(0), giy = static_cast(0); + IndexT gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + IndexT inp_offset_NC = n * inp_sN; + for (IndexT c = 0; c < out_c; ++c, + inp_offset_NC += inp_sC, + gInp_ptr_NC += inp_sC, + gOut_offset += gOut_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd( + gInp_ptr_NC, iy_nw, ix_nw, inp_sH, inp_sW, in_h, in_w, nw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_ne, ix_ne, inp_sH, inp_sW, in_h, in_w, ne * gOut); + AtomicAdd( + gInp_ptr_NC, iy_sw, ix_sw, inp_sH, inp_sW, in_h, in_w, sw * gOut); + AtomicAdd( + gInp_ptr_NC, iy_se, ix_se, inp_sH, inp_sW, in_h, in_w, se * gOut); + + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + T nw_val = input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW]; + gix -= nw_val * (iy_se - iy) * gOut; + giy -= nw_val * (ix_se - ix) * gOut; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + T ne_val = input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW]; + gix += ne_val * (iy_sw - iy) * gOut; + giy -= ne_val * (ix - ix_sw) * gOut; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + T sw_val = input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW]; + gix -= sw_val * (iy - iy_ne) * gOut; + giy += sw_val * (ix_ne - ix) * gOut; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + T se_val = input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW]; + gix += se_val * (iy - iy_nw) * gOut; + giy += se_val * (ix - ix_nw) * gOut; + } + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = gix_mult * gix; + gGrid_ptr_NHW[1] = giy_mult * giy; + } + } else if 
(mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::nearbyint(ix)); + IndexT iy_nearest = static_cast(std::nearbyint(iy)); + + IndexT gOut_offset = n * gOut_sN + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; + ++c, gInp_ptr_NC += inp_sC, gOut_offset += gOut_sC) { + AtomicAdd(gInp_ptr_NC, + iy_nearest, + ix_nearest, + inp_sH, + inp_sW, + in_h, + in_w, + grad_output[gOut_offset]); + } + + if (grad_grid != nullptr) { + T* gGrid_ptr_NHW = grad_grid + index * grid_sW; + gGrid_ptr_NHW[0] = static_cast(0); + gGrid_ptr_NHW[1] = static_cast(0); + } + } + } +} + +template +__global__ void GridSampler3DCudaBackwardKernel(const IndexT nthreads, + const T* grad_output, + const T* input, + const T* grid, + IndexT out_c, + IndexT out_d, + IndexT out_h, + IndexT out_w, + IndexT in_d, + IndexT in_h, + IndexT in_w, + T* grad_input, + T* grad_grid, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sW = 1; + IndexT inp_sH = in_w; + IndexT inp_sD = in_h * in_w; + IndexT inp_sC = in_d * inp_sD; + IndexT inp_sN = out_c * inp_sC; + + IndexT grid_sCoor = 1; + IndexT grid_sW = 3; + IndexT grid_sH = out_w * grid_sW; + IndexT grid_sD = out_h * grid_sH; + IndexT grid_sN = out_d * grid_sD; + + IndexT gOut_sW = 1; + IndexT gOut_sH = out_w; + IndexT gOut_sD = out_h * out_w; + IndexT gOut_sC = out_d * gOut_sD; + IndexT gOut_sN = out_c * gOut_sC; + + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT d = (index / (out_h * out_w)) % out_d; + const IndexT n = index / (out_d * out_h * out_w); + const auto grid_offset = + n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + + // get the corresponding input x, y, z coordinates from grid + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + T iz = grid[grid_offset + 2 * grid_sCoor]; + + // multipliers for gradients on ix, iy, and iz + T gix_mult, giy_mult, giz_mult; + ix = ComputePositionsWithMask( + ix, in_w, padding_mode, align_corners, &gix_mult); + iy = ComputePositionsWithMask( + iy, in_h, padding_mode, align_corners, &giy_mult); + iz = ComputePositionsWithMask( + iz, in_d, padding_mode, align_corners, &giz_mult); + + if (mode == Mode::bilinear) { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + IndexT ix_tnw = static_cast(std::floor(ix)); + IndexT iy_tnw = static_cast(std::floor(iy)); + IndexT iz_tnw = static_cast(std::floor(iz)); + + IndexT ix_tne = ix_tnw + 1; + IndexT iy_tne = iy_tnw; + IndexT iz_tne = iz_tnw; + + IndexT ix_tsw = ix_tnw; + IndexT iy_tsw = iy_tnw + 1; + IndexT iz_tsw = iz_tnw; + + IndexT ix_tse = ix_tnw + 1; + IndexT iy_tse = iy_tnw + 1; + IndexT iz_tse = iz_tnw; + + IndexT ix_bnw = ix_tnw; + IndexT iy_bnw = iy_tnw; + IndexT iz_bnw = iz_tnw + 1; + + IndexT ix_bne = ix_tnw + 1; + IndexT iy_bne = iy_tnw; + IndexT iz_bne = iz_tnw + 1; + + IndexT ix_bsw = ix_tnw; + IndexT iy_bsw = iy_tnw + 1; + IndexT iz_bsw = iz_tnw + 1; + + IndexT ix_bse = ix_tnw + 1; + IndexT iy_bse = iy_tnw + 1; + IndexT iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + T bne = (ix - ix_tsw) * (iy_tsw - iy) * 
(iz - iz_tsw); + T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + T gix = static_cast(0), giy = static_cast(0), + giz = static_cast(0); + IndexT gOut_offset = + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + IndexT inp_offset_NC = n * inp_sN; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; ++c, + gOut_offset += gOut_sC, + gInp_ptr_NC += inp_sC, + inp_offset_NC += inp_sC) { + T gOut = grad_output[gOut_offset]; + + AtomicAdd3D(gInp_ptr_NC, + iz_tnw, + iy_tnw, + ix_tnw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tnw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tne, + iy_tne, + ix_tne, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tne * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tsw, + iy_tsw, + ix_tsw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tsw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_tse, + iy_tse, + ix_tse, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + tse * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bnw, + iy_bnw, + ix_bnw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bnw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bne, + iy_bne, + ix_bne, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bne * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bsw, + iy_bsw, + ix_bsw, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bsw * gOut); + AtomicAdd3D(gInp_ptr_NC, + iz_bse, + iy_bse, + ix_bse, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + bse * gOut); + + // calculate grad_grid + if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) { + T tnw_val = input[inp_offset_NC + iz_tnw * inp_sD + iy_tnw * inp_sH + + ix_tnw * inp_sW]; + gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; + giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; + giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; + } + if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) { + T tne_val = input[inp_offset_NC + iz_tne * inp_sD + iy_tne * inp_sH + + ix_tne * inp_sW]; + gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; + giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; + giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; + } + if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) { + T tsw_val = input[inp_offset_NC + iz_tsw * inp_sD + iy_tsw * inp_sH + + ix_tsw * inp_sW]; + gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; + giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; + giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; + } + if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) { + T tse_val = input[inp_offset_NC + iz_tse * inp_sD + iy_tse * inp_sH + + ix_tse * inp_sW]; + gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; + giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; + giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; + } + if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) { + T bnw_val = input[inp_offset_NC + iz_bnw * inp_sD + iy_bnw * inp_sH + + ix_bnw * inp_sW]; + gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; + giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; + giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; + } + if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) { + T bne_val = input[inp_offset_NC + iz_bne * inp_sD + iy_bne * inp_sH + + ix_bne * inp_sW]; + gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; + giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; + giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; + } + if (InBounds3D(iz_bsw, 
iy_bsw, ix_bsw, in_d, in_h, in_w)) { + T bsw_val = input[inp_offset_NC + iz_bsw * inp_sD + iy_bsw * inp_sH + + ix_bsw * inp_sW]; + gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; + giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; + giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; + } + if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) { + T bse_val = input[inp_offset_NC + iz_bse * inp_sD + iy_bse * inp_sH + + ix_bse * inp_sW]; + gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; + giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; + giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; + } + } + if (grad_grid != nullptr) { + T* gGrid_ptr_NDHW = grad_grid + index * grid_sW; + gGrid_ptr_NDHW[0] = gix_mult * gix; + gGrid_ptr_NDHW[1] = giy_mult * giy; + gGrid_ptr_NDHW[2] = giz_mult * giz; + } + } else if (mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::round(ix)); + IndexT iy_nearest = static_cast(std::round(iy)); + IndexT iz_nearest = static_cast(std::round(iz)); + + // assign nearest neighbor pixel value to output pixel + IndexT gOut_offset = + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + T* gInp_ptr_NC = grad_input + n * inp_sN; + for (IndexT c = 0; c < out_c; + ++c, gOut_offset += gOut_sC, gInp_ptr_NC += inp_sC) { + AtomicAdd3D(gInp_ptr_NC, + iz_nearest, + iy_nearest, + ix_nearest, + inp_sD, + inp_sH, + inp_sW, + in_d, + in_h, + in_w, + grad_output[gOut_offset]); + } + if (grad_grid != nullptr) { + T* gGrid_ptr_NDHW = grad_grid + index * grid_sW; + gGrid_ptr_NDHW[0] = static_cast(0); + gGrid_ptr_NDHW[1] = static_cast(0); + gGrid_ptr_NDHW[2] = static_cast(0); + } + } + } +} + +template +void GridSampleGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const DenseTensor& out_grad, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* x_grad, + DenseTensor* grid_grad) { + if (out_grad.numel() == 0) { + if (x_grad) { + phi::Full( + dev_ctx, phi::IntArray(common::vectorize(x_grad->dims())), 0, x_grad); + } + if (grid_grad) { + phi::Full(dev_ctx, + phi::IntArray(common::vectorize(grid_grad->dims())), + 0, + grid_grad); + } + return; + } + + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + +#ifndef PADDLE_WITH_HIP + if (condCudnnGridSampler(x, grid) && + enum_padding_mode == PaddingMode::zeros && enum_mode == Mode::bilinear && + align_corners) { + const int64_t N = x.dims()[0]; + const int64_t C = x.dims()[1]; + const int64_t H_in = x.dims()[2]; + const int64_t W_in = x.dims()[3]; + const int64_t H_out = grid.dims()[1]; + const int64_t W_out = grid.dims()[2]; + + // cuDNN handle + cudnnHandle_t handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + // Create and set Tensor descriptors (NCHW) for x/y + cudnnTensorDescriptor_t x_desc, dx_desc, y_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&dx_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&y_desc)); + + const cudnnDataType_t cudnn_dtype = + std::is_same::value ? 
CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(x_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + // The shape of dx is consistent with that of x + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(dx_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + // The shape of y is consistent with out_grad + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(y_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out))); + + // Spatial Transformer descriptor: specifies sampler type and output + // dimension (N, C, H_out, W_out) + cudnnSpatialTransformerDescriptor_t st_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateSpatialTransformerDescriptor(&st_desc)); + int st_dims[4] = {static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out)}; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + st_desc, CUDNN_SAMPLER_BILINEAR, cudnn_dtype, 4, st_dims)); + + // data pointer + const T* x_data = x.data(); + const T* grid_data = grid.data(); + const T* dy_data = out_grad.data(); + + T* dx_data = dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* dgrid_data = nullptr; + if (grid_grad) { + dgrid_data = dev_ctx.template Alloc(grid_grad); + } + + // alpha/beta + using AlphaBetaT = typename std:: + conditional::value, float, double>::type; + const AlphaBetaT one = static_cast(1.0); + const AlphaBetaT zero = static_cast(0.0); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfSamplerBackward( + handle, + st_desc, + static_cast(&one), // alpha (for dx) + x_desc, + static_cast(x_data), + static_cast(&zero), // beta (for dx) + dx_desc, + static_cast(dx_data), + static_cast(&one), // alpha (for dgrid) + y_desc, + static_cast(dy_data), + static_cast(grid_data), + static_cast(&zero), // beta (for dgrid) + static_cast(dgrid_data))); + + // resource release + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroySpatialTransformerDescriptor(st_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(dx_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(y_desc)); + return; + } +#endif + + bool use_int32_index = x.numel() <= std::numeric_limits::max() && + grid.numel() <= std::numeric_limits::max() && + out_grad.numel() <= std::numeric_limits::max(); + + if (x.dims().size() == 4) { + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int64_t count = n * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSamplerCudaBackwardKernel \ + <<>>( \ + count, \ + out_grad.data(), \ + x.data(), \ + grid.data(), \ + n, \ + c, \ + 
out_h, \ + out_w, \ + in_h, \ + in_w, \ + x_grad->data(), \ + grid_grad_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners); + if (use_int32_index) { + LAUNCH_KERNEL(int32_t) + } else { + LAUNCH_KERNEL(int64_t) + } +#undef LAUNCH_KERNEL + } else { + const int64_t out_d = grid.dims()[1]; + const int64_t out_h = grid.dims()[2]; + const int64_t out_w = grid.dims()[3]; + const int64_t n = x.dims()[0]; + const int64_t c = x.dims()[1]; + const int64_t in_d = x.dims()[2]; + const int64_t in_h = x.dims()[3]; + const int64_t in_w = x.dims()[4]; + + dev_ctx.template Alloc(x_grad); + phi::funcs::SetConstant()(dev_ctx, x_grad, static_cast(0)); + + T* grid_grad_data = nullptr; + if (grid_grad != nullptr) { + grid_grad_data = dev_ctx.template Alloc(grid_grad); + } + + int64_t count = static_cast(n * out_d * out_h * out_w); + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSampler3DCudaBackwardKernel \ + <<>>( \ + count, \ + out_grad.data(), \ + x.data(), \ + grid.data(), \ + c, \ + out_d, \ + out_h, \ + out_w, \ + in_d, \ + in_h, \ + in_w, \ + x_grad->data(), \ + grid_grad_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners); + if (use_int32_index) { + LAUNCH_KERNEL(int32_t) + } else { + LAUNCH_KERNEL(int64_t) + } +#undef LAUNCH_KERNEL + } +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL(grid_sample_grad, + metax_gpu, + ALL_LAYOUT, + phi::GridSampleGradKernel, + float, + double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu new file mode 100644 index 00000000000..71050c264c6 --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/grid_sample_kernel_register.cu @@ -0,0 +1,527 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "glog/logging.h" +#include "kernels/metax_kernel/metax_context.h" +#include "paddle/phi/backends/dynload/cudnn.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/grid_sample_utils.h" +#include "paddle/phi/kernels/grid_sample_kernel.h" + +namespace phi { + +template +static __forceinline__ __device__ T Unnormalize(T coord, + IndexT size, + bool align_corners) { + return align_corners ?
((coord + 1.f) / 2) * (size - 1) + : ((coord + 1.f) * size - 1) / 2; +} + +template +static __forceinline__ __device__ T ClipIndexes(T in, IndexT max_value) { + return min(static_cast(max_value - 1), max(in, static_cast(0))); +} + +template +static __forceinline__ __device__ T ReflectIndexes(T in, + IndexT twice_low, + IndexT twice_high) { + if (twice_low == twice_high) { + return static_cast(0); + } + T min = static_cast(twice_low) / 2; + T span = static_cast(twice_high - twice_low) / 2; + in = fabs(in - min); + T extra = fmod(in, span); + IndexT flips = floor(in / span); + return (flips & 1) ? span - extra + min : extra + min; // cond ? odd : even +} + +template +static __forceinline__ __device__ T ComputePositions(T coord, + IndexT size, + PaddingMode padding_mode, + bool align_corners) { + coord = Unnormalize(coord, size, align_corners); + if (padding_mode == PaddingMode::border) { + coord = ClipIndexes(coord, size); + } else if (padding_mode == PaddingMode::reflect) { + coord = align_corners ? ReflectIndexes(coord, 0, 2 * (size - 1)) + : ReflectIndexes(coord, -1, 2 * size - 1); + coord = ClipIndexes(coord, size); + } + return SafeDownGradeToIntRange(coord); +} + +template +__global__ void GridSampleCudaKernel(IndexT n, + IndexT out_c, + IndexT out_hw, + IndexT in_h, + IndexT in_w, + const T* __restrict__ input, + const T* __restrict__ grid, + T* __restrict__ output, + const Mode mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT nthreads = n * out_hw; + IndexT inp_sN = out_c * (in_h * in_w); + IndexT inp_sC = in_h * in_w; + IndexT inp_sH = in_w; + IndexT inp_sW = 1; + IndexT grid_sNHW = 2; + IndexT grid_sCoor = 1; + IndexT out_sN = out_c * out_hw; + IndexT out_sC = out_hw; + IndexT out_sHW = 1; + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT hw = index % out_hw; + const IndexT n = index / out_hw; + const IndexT grid_offset = index * grid_sNHW; + + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + if (mode == Mode::bilinear) { + IndexT ix_nw = floor(ix); + IndexT iy_nw = floor(iy); + IndexT ix_ne = ix_nw + 1; + IndexT iy_ne = iy_nw; + IndexT ix_sw = ix_nw; + IndexT iy_sw = iy_nw + 1; + IndexT ix_se = ix_nw + 1; + IndexT iy_se = iy_nw + 1; + + T nw = (ix_se - ix) * (iy_se - iy); + T ne = (ix - ix_sw) * (iy_sw - iy); + T sw = (ix_ne - ix) * (iy - iy_ne); + T se = (ix - ix_nw) * (iy - iy_nw); + + IndexT inp_offset_NC = n * inp_sN; + T* out_ptr_NCHW = output + (n * out_sN + hw * out_sHW); + + for (IndexT c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + T value{0}; + if (InBounds(iy_nw, ix_nw, in_h, in_w)) { + value += input[inp_offset_NC + iy_nw * inp_sH + ix_nw * inp_sW] * nw; + } + if (InBounds(iy_ne, ix_ne, in_h, in_w)) { + value += input[inp_offset_NC + iy_ne * inp_sH + ix_ne * inp_sW] * ne; + } + if (InBounds(iy_sw, ix_sw, in_h, in_w)) { + value += input[inp_offset_NC + iy_sw * inp_sH + ix_sw * inp_sW] * sw; + } + if (InBounds(iy_se, ix_se, in_h, in_w)) { + value += input[inp_offset_NC + iy_se * inp_sH + ix_se * inp_sW] * se; + } + *out_ptr_NCHW = value; + } + } else if (mode == Mode::nearest) { + IndexT ix_nearest = std::nearbyint(ix); + IndexT iy_nearest = std::nearbyint(iy); + IndexT inp_offset_NC = n * inp_sN; + T* out_ptr_NCHW = output + (n * out_sN + hw * out_sHW); + for (IndexT c = 0; c < out_c; + ++c, inp_offset_NC += inp_sC, out_ptr_NCHW += out_sC) { + if 
(InBounds(iy_nearest, ix_nearest, in_h, in_w)) { + *out_ptr_NCHW = + input[inp_offset_NC + iy_nearest * inp_sH + ix_nearest * inp_sW]; + } else { + *out_ptr_NCHW = static_cast(0); + } + } + } + } +} + +template +__global__ void GridSample3DCudaKernel(const IndexT nthreads, + IndexT out_c, + IndexT out_d, + IndexT out_h, + IndexT out_w, + IndexT in_d, + IndexT in_h, + IndexT in_w, + const T* input, + const T* grid, + T* output, + const Mode interpolation_mode, + const PaddingMode padding_mode, + bool align_corners) { + IndexT inp_sW = 1; + IndexT inp_sH = in_w; + IndexT inp_sD = in_h * in_w; + IndexT inp_sC = in_d * inp_sD; + IndexT inp_sN = out_c * inp_sC; + + IndexT grid_sCoor = 1; + IndexT grid_sW = 3; + IndexT grid_sH = out_w * grid_sW; + IndexT grid_sD = out_h * grid_sH; + IndexT grid_sN = out_d * grid_sD; + + IndexT out_sW = 1; + IndexT out_sH = out_w; + IndexT out_sD = out_h * out_w; + IndexT out_sC = out_d * out_sD; + IndexT out_sN = out_c * out_sC; + + CUDA_KERNEL_LOOP_TYPE(index, nthreads, IndexT) { + const IndexT w = index % out_w; + const IndexT h = (index / out_w) % out_h; + const IndexT d = (index / (out_h * out_w)) % out_d; + const IndexT n = index / (out_d * out_h * out_w); + const IndexT grid_offset = + n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; + // get the corresponding input x, y, z coordinates from grid + T ix = grid[grid_offset]; + T iy = grid[grid_offset + grid_sCoor]; + T iz = grid[grid_offset + 2 * grid_sCoor]; + ix = ComputePositions(ix, in_w, padding_mode, align_corners); + iy = ComputePositions(iy, in_h, padding_mode, align_corners); + iz = ComputePositions(iz, in_d, padding_mode, align_corners); + if (interpolation_mode == Mode::bilinear) { + // get corner pixel values from (x, y, z) + // for 4d, we used north-east-south-west + // for 5d, we add top-bottom + IndexT ix_tnw = static_cast(std::floor(ix)); + IndexT iy_tnw = static_cast(std::floor(iy)); + IndexT iz_tnw = static_cast(std::floor(iz)); + + IndexT ix_tne = ix_tnw + 1; + IndexT iy_tne = iy_tnw; + IndexT iz_tne = iz_tnw; + + IndexT ix_tsw = ix_tnw; + IndexT iy_tsw = iy_tnw + 1; + IndexT iz_tsw = iz_tnw; + + IndexT ix_tse = ix_tnw + 1; + IndexT iy_tse = iy_tnw + 1; + IndexT iz_tse = iz_tnw; + + IndexT ix_bnw = ix_tnw; + IndexT iy_bnw = iy_tnw; + IndexT iz_bnw = iz_tnw + 1; + + IndexT ix_bne = ix_tnw + 1; + IndexT iy_bne = iy_tnw; + IndexT iz_bne = iz_tnw + 1; + + IndexT ix_bsw = ix_tnw; + IndexT iy_bsw = iy_tnw + 1; + IndexT iz_bsw = iz_tnw + 1; + + IndexT ix_bse = ix_tnw + 1; + IndexT iy_bse = iy_tnw + 1; + IndexT iz_bse = iz_tnw + 1; + + // get surfaces to each neighbor: + T tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); + T tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); + T tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); + T tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); + T bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); + T bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); + T bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); + T bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); + + const T* inp_ptr_NC = input + n * inp_sN; + T* out_ptr_NCDHW = + output + (n * out_sN + d * out_sD + h * out_sH + w * out_sW); + for (IndexT c = 0; c < out_c; + ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + *out_ptr_NCDHW = static_cast(0); + if (InBounds3D(iz_tnw, iy_tnw, ix_tnw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * + tnw; + } + if (InBounds3D(iz_tne, iy_tne, ix_tne, in_d, in_h, in_w)) { + 
*out_ptr_NCDHW += + inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * + tne; + } + if (InBounds3D(iz_tsw, iy_tsw, ix_tsw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * + tsw; + } + if (InBounds3D(iz_tse, iy_tse, ix_tse, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * + tse; + } + if (InBounds3D(iz_bnw, iy_bnw, ix_bnw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * + bnw; + } + if (InBounds3D(iz_bne, iy_bne, ix_bne, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * + bne; + } + if (InBounds3D(iz_bsw, iy_bsw, ix_bsw, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * + bsw; + } + if (InBounds3D(iz_bse, iy_bse, ix_bse, in_d, in_h, in_w)) { + *out_ptr_NCDHW += + inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * + bse; + } + } + } else if (interpolation_mode == Mode::nearest) { + IndexT ix_nearest = static_cast(std::nearbyint(ix)); + IndexT iy_nearest = static_cast(std::nearbyint(iy)); + IndexT iz_nearest = static_cast(std::nearbyint(iz)); + + // assign nearest neighbor pixel value to output pixel + const T* inp_ptr_NC = input + n * inp_sN; + T* out_ptr_NCDHW = + output + (n * out_sN + d * out_sD + h * out_sH + w * out_sW); + for (IndexT c = 0; c < out_c; + ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { + if (InBounds3D(iz_nearest, iy_nearest, ix_nearest, in_d, in_h, in_w)) { + *out_ptr_NCDHW = + inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + + ix_nearest * inp_sW]; + } else { + *out_ptr_NCDHW = static_cast(0); + } + } + } + } +} + +template +void GridSampleKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& grid, + const std::string& mode, + const std::string& padding_mode, + bool align_corners, + DenseTensor* out) { + if (out && out->numel() == 0) { + dev_ctx.template Alloc(out); + return; + } + PaddingMode enum_padding_mode; + Mode enum_mode; + if (padding_mode == "border") { + enum_padding_mode = PaddingMode::border; + } else if (padding_mode == "reflection") { + enum_padding_mode = PaddingMode::reflect; + } else { + enum_padding_mode = PaddingMode::zeros; + } + + if (mode == "nearest") { + enum_mode = Mode::nearest; + } else { + enum_mode = Mode::bilinear; + } + +#ifndef PADDLE_WITH_HIP + if (condCudnnGridSampler(x, grid) && + enum_padding_mode == PaddingMode::zeros && enum_mode == Mode::bilinear && + align_corners) { + const int64_t N = x.dims()[0]; + const int64_t C = x.dims()[1]; + const int64_t H_in = x.dims()[2]; + const int64_t W_in = x.dims()[3]; + const int64_t H_out = grid.dims()[1]; + const int64_t W_out = grid.dims()[2]; + + out->Resize({N, C, H_out, W_out}); + auto* out_data = dev_ctx.template Alloc(out); + + cudnnHandle_t handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + // Create and set Tensor descriptors (NCHW) for x and out + cudnnTensorDescriptor_t x_desc, y_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateTensorDescriptor(&y_desc)); + + const cudnnDataType_t cudnn_dtype = + std::is_same::value ? 
CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE; + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(x_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_in), + static_cast(W_in))); + + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetTensor4dDescriptor(y_desc, + CUDNN_TENSOR_NCHW, + cudnn_dtype, + static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out))); + + // Spatial Transformer descriptor: specifies sampler type and output + // dimension (N, C, H_out, W_out) + cudnnSpatialTransformerDescriptor_t st_desc; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnCreateSpatialTransformerDescriptor(&st_desc)); + int st_dims[4] = {static_cast(N), + static_cast(C), + static_cast(H_out), + static_cast(W_out)}; + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnSetSpatialTransformerNdDescriptor( + st_desc, CUDNN_SAMPLER_BILINEAR, cudnn_dtype, 4, st_dims)); + + const T* x_data = x.data(); + const T* grid_data = grid.data(); + using AlphaBetaT = typename std:: + conditional::value, float, double>::type; + const AlphaBetaT alpha = static_cast(1.0); + const AlphaBetaT beta = static_cast(0.0); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSpatialTfSamplerForward( + handle, + st_desc, + static_cast(&alpha), + x_desc, + static_cast(x_data), + static_cast(grid_data), + static_cast(&beta), + y_desc, + static_cast(out_data))); + + // resource release + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroySpatialTransformerDescriptor(st_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(x_desc)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnDestroyTensorDescriptor(y_desc)); + return; + } +#endif + + bool use_int32_index = x.numel() <= std::numeric_limits::max() && + grid.numel() <= std::numeric_limits::max() && + out->numel() <= std::numeric_limits::max(); + + if (x.dims().size() == 4) { + const int64_t n = grid.dims()[0]; + const int64_t out_h = grid.dims()[1]; + const int64_t out_w = grid.dims()[2]; + const int64_t c = x.dims()[1]; + const int64_t in_h = x.dims()[2]; + const int64_t in_w = x.dims()[3]; + VLOG(3) << "n: " << n << "; c: " << c << "; out_h: " << out_h + << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3]; + + int64_t count = n * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSampleCudaKernel \ + <<>>( \ + n, \ + c, \ + out_h * out_w, \ + in_h, \ + in_w, \ + x.data(), \ + grid.data(), \ + output_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners) + if (use_int32_index) { + LAUNCH_KERNEL(int); + } else { + LAUNCH_KERNEL(int64_t); + } +#undef LAUNCH_KERNEL + } else { + const int64_t n = grid.dims()[0]; + const int64_t out_d = grid.dims()[1]; + const int64_t out_h = grid.dims()[2]; + const int64_t out_w = grid.dims()[3]; + const int64_t c = x.dims()[1]; + const int64_t in_d = x.dims()[2]; + const int64_t in_h = x.dims()[3]; + const int64_t in_w = x.dims()[4]; + + VLOG(3) << "n: " << n << "; c: " << c << "; out_d: " << out_d + << "; out_h: " << out_h << "; out_w: " << out_w; + + auto* output_data = dev_ctx.template Alloc(out); + VLOG(3) << "out dims: " << out->dims()[0] << "; " << out->dims()[1] << "; " + << out->dims()[2] << "; " << out->dims()[3] << "; " + << 
out->dims()[4]; + + int64_t count = n * out_d * out_h * out_w; + auto cu_stream = dev_ctx.stream(); + backends::gpu::GpuLaunchConfig config = + backends::gpu::GetGpuLaunchConfig1D(dev_ctx, count); + +#define LAUNCH_KERNEL(INDEX_TYPE) \ + GridSample3DCudaKernel \ + <<>>( \ + count, \ + c, \ + out_d, \ + out_h, \ + out_w, \ + in_d, \ + in_h, \ + in_w, \ + x.data(), \ + grid.data(), \ + output_data, \ + enum_mode, \ + enum_padding_mode, \ + align_corners) + if (use_int32_index) { + LAUNCH_KERNEL(int); + } else { + LAUNCH_KERNEL(int64_t); + } +#undef LAUNCH_KERNEL + } +} + +} // namespace phi + +PD_REGISTER_PLUGIN_KERNEL( + grid_sample, metax_gpu, ALL_LAYOUT, phi::GridSampleKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu index eae8c8c0301..d2f39ccf751 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu @@ -35,6 +35,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, const int32_t group_size, DenseTensor* out) { dev_ctx.template Alloc(out); + auto stream = dev_ctx.stream(); const T* x_data = x.data(); const int8_t* weight_data = weight.data(); const T* bias_data = bias ? bias.get().data() : nullptr; @@ -128,7 +129,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, k, n, n}; - mctlass_op(arguments); + mctlass_op(arguments, NULL, stream); } else { mctlassGemmScaleOp_w8a16_bias mctlass_op; typename mctlassGemmScaleOp_w8a16_bias::Arguments arguments{ From 89912995a39f939a582aeb953f761a588c89663d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:02:41 +0800 Subject: [PATCH 45/95] =?UTF-8?q?change=E2=80=94ut=20(#59)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * change_ut --- backends/metax_gpu/tests/default.txt | 9 --------- 1 file changed, 9 deletions(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 9f073d7e92f..9c989161fed 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -42,7 +42,6 @@ test_shape_op test_tril_triu_op test_slice_op test_elementwise_add_op -test_index_put_op test_bincount_op test_assign_op test_logical_op @@ -73,7 +72,6 @@ test_fractional_max_pool3d_api test_nll_loss test_is_empty_op test_norm_nn_grad -test_index_fill test_floor test_slice_scatter test_nn_matmul_v2_grad @@ -127,10 +125,8 @@ test_flip test_fused_bias_dropout_residual_layer_norm_op test_greater_equal_op test_add_op -test_cartesian_prod test_uniform_random_inplace_op test_feed_fetch_method -test_pow_op test_conv3d_transpose_op test_add_position_encoding_op test_imperative_data_loader_base @@ -223,12 +219,9 @@ test_executor_check_fetch_list test_inplace_softmax_with_cross_entropy test_cos test_imperative_parallel_coalesce_split -test_grid_sample_function -test_rnn_decode_api test_triu_indices_op test_binary_cross_entropy_with_logits_op test_mean_op_v1 -test_round_op test_assign_pos_op_dygraph test_nn_functional_embedding_static test_norm_op @@ -262,7 +255,6 @@ test_diag_v2 test_complex_transpose test_prior_box_op test_square_error_cost -test_fused_rotary_position_embedding test_gru_rnn_op test_restrict_nonzero test_dygraph_weight_norm @@ -295,7 +287,6 @@ test_argsort_op test_layer_norm_op_v2 test_adaptive_max_pool1d test_shard_index_op -test_cuda_max_memory_allocated 
test_roi_align_op test_sin test_take From a770e6f197e8c519712a4a7d2359110d34dc0431 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 23 Sep 2025 20:50:24 +0800 Subject: [PATCH 46/95] change_ut (#60) * change_ut --------- --- backends/metax_gpu/tests/default.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 9c989161fed..21adad68f5b 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -28,7 +28,6 @@ test_one_hot_v2_op test_fill_any_op test_gather_op test_reshape_op -test_index_put_op test_bitwise_op test_max_op test_pad_op @@ -214,7 +213,6 @@ test_tile_op test_adam_optimizer_fp32_fp64 test_batch_norm_op test_gather_nd_op -test_pow test_executor_check_fetch_list test_inplace_softmax_with_cross_entropy test_cos From 902112bb8707edebefa747e4994384df27c3f356 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Wed, 24 Sep 2025 10:05:05 +0800 Subject: [PATCH 47/95] change_ut (#63) * change_ut * change_ut --------- --- backends/metax_gpu/tests/default.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 21adad68f5b..54f0b7c008f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -177,7 +177,6 @@ test_imperative_data_parallel test_sigmoid test_adaptive_max_pool3d test_roll_op -test_index_put_op test_assign_op test_amp_check_finite_and_scale_op test_strided_slice_op From 61c32baffa5c6711c2962ee35f9bffe270668e1b Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Wed, 24 Sep 2025 16:21:06 +0800 Subject: [PATCH 48/95] [Metax] add keyword filter in CI CMakeLists.txt --- backends/metax_gpu/tests/CMakeLists.txt | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index e8b11d347d9..b869ee2b929 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -9,6 +9,8 @@ set(PADDLE_LEGACY_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) +set(NEED_REMOVE_KEYWORDS "attention") + file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") if(NOT TEST_LIST_FILE) @@ -33,6 +35,20 @@ else() endif() foreach(test_name ${TEST_PROGRAMS}) + set(IS_REMOVE FALSE) + + foreach(keyword ${NEED_REMOVE_KEYWORDS}) + string(FIND "${test_name}" "${keyword}" RES) + if(NOT RES EQUAL -1) + set(IS_REMOVE TRUE) + break() + endif() + endforeach() + + if(IS_REMOVE) + continue() + endif() + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") From b2ddc812d2c6851aa3a3e997069c0c0953bbb0a2 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Thu, 25 Sep 2025 15:59:52 +0800 Subject: [PATCH 49/95] [Metax] add ignore case list --- backends/metax_gpu/tests/CMakeLists.txt | 46 +++++++------------------ backends/metax_gpu/tests/ignore.txt | 21 +++++++++++ 2 files changed, 34 insertions(+), 33 deletions(-) create mode 100644 backends/metax_gpu/tests/ignore.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index b869ee2b929..0c84ada4b65 100755 --- 
a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -60,39 +60,19 @@ endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) if(NOT TEST_LIST_FILE) - list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) + set(NEED_IGNORE_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) + if(EXISTS ${NEED_IGNORE_FILE}) + file(STRINGS ${NEED_IGNORE_FILE} NEED_IGNORE_TEST_PROGRAMS) + foreach(test_name ${NEED_IGNORE_TEST_PROGRAMS}) + if(EXISTS ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + else() + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${METAX_UNIT_TEST_PATH}/${test_name}.py) + endif() + endforeach() + endif() endif() if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt new file mode 100644 index 00000000000..b4f1afbe5b0 --- /dev/null +++ b/backends/metax_gpu/tests/ignore.txt @@ -0,0 +1,21 @@ +test_matmul_op_metax +test_sum_op +test_max_op +test_cumsum_op +test_softmax_with_cross_entropy_op +test_softmax_op +test_elementwise_add_op +test_gather_op +test_elementwise_pow_op +test_layer_norm_op +test_index_add_op +test_elementwise_div_op +test_stack_op +test_logical_op +test_mean_op +test_transpose_op +test_randint_op +test_uniform_random_op +test_c_embedding_op +test_slice_op +test_compare_op From cfe44ce24e2e67c595057e0568b7c34f55c08b0a Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 25 Sep 2025 16:04:11 +0800 Subject: [PATCH 50/95] [Metax] add keyword filter in CI CMakeLists.txt (#64) * [Metax] add keyword filter in CI CMakeLists.txt * [Metax] add ignore case list --- backends/metax_gpu/tests/CMakeLists.txt | 62 ++++++++++++------------- backends/metax_gpu/tests/ignore.txt | 21 +++++++++ 2 files changed, 50 insertions(+), 33 deletions(-) create mode 100644 backends/metax_gpu/tests/ignore.txt diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index e8b11d347d9..0c84ada4b65 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -9,6 +9,8 @@ set(PADDLE_LEGACY_TEST_PATH 
${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) +set(NEED_REMOVE_KEYWORDS "attention") + file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") if(NOT TEST_LIST_FILE) @@ -33,6 +35,20 @@ else() endif() foreach(test_name ${TEST_PROGRAMS}) + set(IS_REMOVE FALSE) + + foreach(keyword ${NEED_REMOVE_KEYWORDS}) + string(FIND "${test_name}" "${keyword}" RES) + if(NOT RES EQUAL -1) + set(IS_REMOVE TRUE) + break() + endif() + endforeach() + + if(IS_REMOVE) + continue() + endif() + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") @@ -44,39 +60,19 @@ endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) if(NOT TEST_LIST_FILE) - list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) + set(NEED_IGNORE_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) + if(EXISTS ${NEED_IGNORE_FILE}) + file(STRINGS ${NEED_IGNORE_FILE} NEED_IGNORE_TEST_PROGRAMS) + foreach(test_name ${NEED_IGNORE_TEST_PROGRAMS}) + if(EXISTS ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + else() + list(REMOVE_ITEM PYTHON_TEST_SCRIPTS + ${METAX_UNIT_TEST_PATH}/${test_name}.py) + endif() + endforeach() + endif() endif() if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt new file mode 100644 index 00000000000..b4f1afbe5b0 --- /dev/null +++ b/backends/metax_gpu/tests/ignore.txt @@ -0,0 +1,21 @@ +test_matmul_op_metax +test_sum_op +test_max_op +test_cumsum_op +test_softmax_with_cross_entropy_op +test_softmax_op +test_elementwise_add_op +test_gather_op +test_elementwise_pow_op +test_layer_norm_op +test_index_add_op +test_elementwise_div_op +test_stack_op +test_logical_op +test_mean_op +test_transpose_op +test_randint_op +test_uniform_random_op +test_c_embedding_op +test_slice_op +test_compare_op From 087a9c1240f024210d536e543a2fc55db1175529 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Fri, 26 Sep 2025 14:04:36 +0800 Subject: [PATCH 51/95] [Metax] fix phi::backends::gpu::DnnVersion() 
symbol not found --- backends/metax_gpu/patch/paddle.patch | 216 +++++++++++++----------- backends/metax_gpu/tests/CMakeLists.txt | 9 +- 2 files changed, 122 insertions(+), 103 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index beefb730bf7..8b8ae26dbba 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,7 +132,7 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 1547909d92..ef20838434 100644 --- a/paddle/phi/backends/dynload/cufft.h @@ -140,7 +140,7 @@ index 1547909d92..ef20838434 100644 @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -232,26 +232,26 @@ index 4ff2e528a9..23f7f4b583 100644 @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..23f7f4b583 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..23f7f4b583 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..23f7f4b583 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..23f7f4b583 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..23f7f4b583 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..23f7f4b583 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..23f7f4b583 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - 
unsigned mask, phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..23f7f4b583 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,15 +343,34 @@ index 4ff2e528a9..23f7f4b583 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) +diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc +index 99c9eb6ed0..875f1ef38b 100644 +--- a/paddle/phi/backends/gpu/cuda/cuda_info.cc ++++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc +@@ -25,12 +25,12 @@ static std::vector g_device_props; + + namespace phi::backends::gpu { + +-#ifndef PADDLE_WITH_CUSTOM_DEVICE ++// #ifndef PADDLE_WITH_CUSTOM_DEVICE + int DnnVersion() { + if (!dynload::HasCUDNN()) return -1; + return dynload::cudnnGetVersion(); // NOLINT + } +-#endif ++// #endif + + static int GetGPUDeviceCountImpl() { + int driverVersion = 0; diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. */ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -361,9 +380,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -379,7 +398,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -392,7 +411,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -400,16 +419,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. 
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,53 +449,53 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 4eae698648..5c047723ea 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -43,11 +43,11 @@ template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; - + inline static int GetDesiredBlockDim(int64_t block_dim) { - const int kMaxBlockDim = 512; + const int kMaxBlockDim = 256; @@ -494,12 +513,12 @@ index 15e1a4a3c3..e4780538d7 100644 +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -507,14 +526,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -532,19 +551,19 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 8b0baf5f5f..260482f124 100644 +index 047f52bd91..a05b34d3ba 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..108edda7ca 100644 @@ -553,7 +572,7 @@ index e30d440ff3..108edda7ca 100644 @@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -563,7 +582,7 @@ index e30d440ff3..108edda7ca 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -624,7 +643,7 @@ index e30d440ff3..108edda7ca 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -680,7 +699,7 @@ index e30d440ff3..108edda7ca 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -715,7 +734,7 @@ index e30d440ff3..108edda7ca 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -739,7 +758,7 @@ index e30d440ff3..108edda7ca 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; @@ -761,7 +780,7 @@ index e30d440ff3..108edda7ca 100644 + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -789,7 +808,7 @@ index e30d440ff3..108edda7ca 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -798,17 +817,17 @@ index e30d440ff3..108edda7ca 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. 
Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -821,12 +840,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -848,12 +867,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -861,19 +880,19 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu -index 4c93778bde..c7bdf8a2cc 100644 +index e7b3d92449..f9403cc5dd 100644 --- a/paddle/phi/kernels/gpu/correlation_kernel.cu +++ b/paddle/phi/kernels/gpu/correlation_kernel.cu -@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, +@@ -112,7 +112,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, int stride2, int corr_type_multiply, DenseTensor *out) { @@ -894,7 +913,7 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu @@ -934,7 +953,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -945,9 +964,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -958,9 +977,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -1005,7 +1024,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -1018,14 +1037,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -1050,7 +1069,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1070,27 +1089,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1106,11 +1125,11 @@ index 5ebbc8d2db..c7b6c338e2 100644 +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" +#include "kernels/metax_kernel/metax_context.h" - + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1124,12 +1143,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1137,13 +1156,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1175,11 +1194,10 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 0c84ada4b65..7f6d853df49 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -8,6 +8,8 @@ find_package(Python REQUIRED COMPONENTS Interpreter) set(PADDLE_LEGACY_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) +set(METAX_DEFAULT_TEST_FILE ${CMAKE_CURRENT_LIST_DIR}/default.txt) +set(METAX_IGNORE_TEST_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) set(NEED_REMOVE_KEYWORDS "attention") @@ -18,7 +20,7 @@ if(NOT TEST_LIST_FILE) STATUS " is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used." ) - file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) + file(STRINGS ${METAX_DEFAULT_TEST_FILE} TEST_PROGRAMS) else() if(NOT EXISTS ${TEST_LIST_FILE}) @@ -60,9 +62,8 @@ endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) if(NOT TEST_LIST_FILE) - set(NEED_IGNORE_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) - if(EXISTS ${NEED_IGNORE_FILE}) - file(STRINGS ${NEED_IGNORE_FILE} NEED_IGNORE_TEST_PROGRAMS) + if(EXISTS ${METAX_IGNORE_TEST_FILE}) + file(STRINGS ${METAX_IGNORE_TEST_FILE} NEED_IGNORE_TEST_PROGRAMS) foreach(test_name ${NEED_IGNORE_TEST_PROGRAMS}) if(EXISTS ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) list(REMOVE_ITEM PYTHON_TEST_SCRIPTS From 73710c59915a9a1b91ab09b5d126400c74c7c205 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Fri, 26 Sep 2025 14:20:04 +0800 Subject: [PATCH 52/95] Revert "[Metax] fix phi::backends::gpu::DnnVersion() symbol not found" This reverts commit 087a9c1240f024210d536e543a2fc55db1175529. 
--- backends/metax_gpu/patch/paddle.patch | 216 +++++++++++------------- backends/metax_gpu/tests/CMakeLists.txt | 9 +- 2 files changed, 103 insertions(+), 122 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 8b8ae26dbba..beefb730bf7 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644 - set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) + # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header}) endif() - + set(EIGEN_INCLUDE_DIR ${SOURCE_DIR}) diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc index bff0f2bf70..9376b5781f 100644 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc @@ -16,7 +16,7 @@ - + #include - + -#include "paddle/fluid/platform/enforce.h" +// #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/core/os_info.h" @@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - + #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ @@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644 + __macro(cudnnRNNForwardInferenceEx); CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - + -#if CUDNN_VERSION < 90000 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \ - __macro(cudnnGetRNNParamsSize); \ @@ -132,7 +132,7 @@ index c0080f0a5e..458ca3e2e8 100644 -#endif } // namespace dynload } // namespace phi - + diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h index 1547909d92..ef20838434 100644 --- a/paddle/phi/backends/dynload/cufft.h @@ -140,7 +140,7 @@ index 1547909d92..ef20838434 100644 @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - + Licensed under the Apache License, Version 2.0 (the "License"); @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name); cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \ @@ -160,23 +160,23 @@ index 59e92955c9..d2f8c2da15 100644 @@ -24,8 +24,8 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/dynamic_loader.h" #include "paddle/phi/common/port.h" - + -namespace phi { -namespace dynload { +// namespace phi { +// namespace dynload { - + extern std::once_flag cupti_dso_flag; extern void *cupti_dso_handle; @@ -71,7 +71,7 @@ extern void *cupti_dso_handle; CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); - + #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP -} // namespace dynload -} // namespace phi +// } // namespace dynload +// } // namespace phi - + -#endif // PADDLE_WITH_CUPTI +#endif // PADDLE_WITH_CUPTI \ No newline at end of file @@ -232,26 +232,26 @@ index 4ff2e528a9..23f7f4b583 100644 @@ -1,3 +1,4 @@ +// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
- + Licensed under the Apache License, Version 2.0 (the "License"); @@ -25,7 +26,7 @@ namespace phi { namespace backends { namespace gpu { - + -#define FULL_WARP_MASK 0xFFFFFFFF +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) - + @@ -45,12 +46,12 @@ namespace gpu { - + template __forceinline__ __device__ T -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) { return __shfl_down_sync(mask, val, static_cast(delta), width); } - + template -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask, @@ -259,7 +259,7 @@ index 4ff2e528a9..23f7f4b583 100644 int width = warpSize) { return __shfl_xor_sync(mask, val, width); @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync( - unsigned mask, phi::dtype::float16 val, int delta, int width) { @@ -267,7 +267,7 @@ index 4ff2e528a9..23f7f4b583 100644 return phi::dtype::float16(__shfl_down_sync( mask, val.to_half(), static_cast(delta), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - unsigned mask, phi::dtype::bfloat16 val, int delta, int width) { @@ -276,7 +276,7 @@ index 4ff2e528a9..23f7f4b583 100644 return phi::dtype::bfloat16(__shfl_down_sync( mask, val.to_nv_bfloat16(), static_cast(delta), width)); @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -285,7 +285,7 @@ index 4ff2e528a9..23f7f4b583 100644 mask, static_cast(val.real), static_cast(delta), width)); float imag = static_cast(__shfl_down_sync( @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - unsigned mask, phi::dtype::complex val, int delta, int width) { @@ -294,14 +294,14 @@ index 4ff2e528a9..23f7f4b583 100644 static_cast(__shfl_down_sync(mask, static_cast(val.real), @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync( - + template <> __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync( - unsigned mask, phi::dtype::float16 val, int width) { + unsigned long long mask, phi::dtype::float16 val, int width) { return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width)); } - + template <> __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - unsigned mask, phi::dtype::bfloat16 val, int width) { @@ -310,7 +310,7 @@ index 4ff2e528a9..23f7f4b583 100644 return phi::dtype::bfloat16( __shfl_xor_sync(mask, val.to_nv_bfloat16(), width)); @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, phi::dtype::complex val, int width) { @@ -319,7 +319,7 @@ index 4ff2e528a9..23f7f4b583 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); float imag = static_cast( @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template <> __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - unsigned mask, 
phi::dtype::complex val, int width) { @@ -328,14 +328,14 @@ index 4ff2e528a9..23f7f4b583 100644 __shfl_xor_sync(mask, static_cast(val.real), width)); double imag = static_cast( @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync( - + template __forceinline__ __device__ T -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) { return __shfl_sync(mask, val, src_line, width); } - + @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) { // but most card's warp size is 32. const int warpSize = 32; @@ -343,34 +343,15 @@ index 4ff2e528a9..23f7f4b583 100644 - unsigned mask = 0u; + unsigned long long mask = 0ull; CREATE_SHFL_MASK(mask, tid < len); - + for (int offset = warpSize / 2; offset > 0; offset /= 2) -diff --git a/paddle/phi/backends/gpu/cuda/cuda_info.cc b/paddle/phi/backends/gpu/cuda/cuda_info.cc -index 99c9eb6ed0..875f1ef38b 100644 ---- a/paddle/phi/backends/gpu/cuda/cuda_info.cc -+++ b/paddle/phi/backends/gpu/cuda/cuda_info.cc -@@ -25,12 +25,12 @@ static std::vector g_device_props; - - namespace phi::backends::gpu { - --#ifndef PADDLE_WITH_CUSTOM_DEVICE -+// #ifndef PADDLE_WITH_CUSTOM_DEVICE - int DnnVersion() { - if (!dynload::HasCUDNN()) return -1; - return dynload::cudnnGetVersion(); // NOLINT - } --#endif -+// #endif - - static int GetGPUDeviceCountImpl() { - int driverVersion = 0; diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h index 024a7de73e..1e4cdf16be 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h @@ -45,7 +45,9 @@ limitations under the License. */ #endif - + #ifdef PADDLE_WITH_CUDA -#include "paddle/phi/backends/dynload/cublas.h" +// #include "paddle/phi/backends/dynload/../../../../../cublas.h" @@ -380,9 +361,9 @@ index 024a7de73e..1e4cdf16be 100644 #include "paddle/phi/backends/dynload/curand.h" #include "paddle/phi/backends/dynload/cusolver.h" @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } - + void ThrowWarnInternal(const std::string& message); - + -#if defined(__CUDA_ARCH__) +#if defined(__CUDACC__) // For cuda, the assertions can affect performance and it is therefore @@ -398,7 +379,7 @@ index 024a7de73e..1e4cdf16be 100644 } while (0) #elif defined(__HIPCC__) @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - + } // namespace enforce using namespace enforce; // NOLINT -} // namespace phi @@ -411,7 +392,7 @@ index c646e487d0..325122175c 100644 @@ -25,8 +25,9 @@ #else #include - + -#include "paddle/phi/backends/dynload/cublas.h" -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublas.h" @@ -419,16 +400,16 @@ index c646e487d0..325122175c 100644 +// #include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cudnn.h" #endif - + @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - + // TODO(Ming Huang): Since there is no blasLt handler, // use rocblas_handle for workaround. 
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -449,53 +430,53 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 4eae698648..5c047723ea 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -43,11 +43,11 @@ template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; - + inline static int GetDesiredBlockDim(int64_t block_dim) { - const int kMaxBlockDim = 512; + const int kMaxBlockDim = 256; @@ -513,12 +494,12 @@ index 15e1a4a3c3..e4780538d7 100644 +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -526,14 +507,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -551,19 +532,19 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 047f52bd91..a05b34d3ba 100644 +index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..108edda7ca 100644 @@ -572,7 +553,7 @@ index e30d440ff3..108edda7ca 100644 @@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -582,7 +563,7 @@ index e30d440ff3..108edda7ca 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -643,7 +624,7 @@ index e30d440ff3..108edda7ca 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -699,7 +680,7 @@ index e30d440ff3..108edda7ca 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -734,7 +715,7 @@ index e30d440ff3..108edda7ca 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -758,7 +739,7 @@ index e30d440ff3..108edda7ca 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; @@ -780,7 +761,7 @@ index e30d440ff3..108edda7ca 100644 + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -808,7 +789,7 @@ index e30d440ff3..108edda7ca 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -817,17 +798,17 @@ index e30d440ff3..108edda7ca 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. 
Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -840,12 +821,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -867,12 +848,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -880,19 +861,19 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu -index e7b3d92449..f9403cc5dd 100644 +index 4c93778bde..c7bdf8a2cc 100644 --- a/paddle/phi/kernels/gpu/correlation_kernel.cu +++ b/paddle/phi/kernels/gpu/correlation_kernel.cu -@@ -112,7 +112,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, +@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, int stride2, int corr_type_multiply, DenseTensor *out) { @@ -913,7 +894,7 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu @@ -953,7 +934,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -964,9 +945,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -977,9 +958,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -1024,7 +1005,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -1037,14 +1018,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -1069,7 +1050,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1089,27 +1070,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1125,11 +1106,11 @@ index 5ebbc8d2db..c7b6c338e2 100644 +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" +#include "kernels/metax_kernel/metax_context.h" - + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1143,12 +1124,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1156,13 +1137,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1194,10 +1175,11 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" + diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 7f6d853df49..0c84ada4b65 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -8,8 +8,6 @@ find_package(Python REQUIRED COMPONENTS Interpreter) set(PADDLE_LEGACY_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) -set(METAX_DEFAULT_TEST_FILE ${CMAKE_CURRENT_LIST_DIR}/default.txt) -set(METAX_IGNORE_TEST_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) set(NEED_REMOVE_KEYWORDS "attention") @@ -20,7 +18,7 @@ if(NOT TEST_LIST_FILE) STATUS " is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used." ) - file(STRINGS ${METAX_DEFAULT_TEST_FILE} TEST_PROGRAMS) + file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) else() if(NOT EXISTS ${TEST_LIST_FILE}) @@ -62,8 +60,9 @@ endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) if(NOT TEST_LIST_FILE) - if(EXISTS ${METAX_IGNORE_TEST_FILE}) - file(STRINGS ${METAX_IGNORE_TEST_FILE} NEED_IGNORE_TEST_PROGRAMS) + set(NEED_IGNORE_FILE ${CMAKE_CURRENT_LIST_DIR}/ignore.txt) + if(EXISTS ${NEED_IGNORE_FILE}) + file(STRINGS ${NEED_IGNORE_FILE} NEED_IGNORE_TEST_PROGRAMS) foreach(test_name ${NEED_IGNORE_TEST_PROGRAMS}) if(EXISTS ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) list(REMOVE_ITEM PYTHON_TEST_SCRIPTS From 78946fd334dacbdb3f8ba9b07d9273a8462e8512 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Fri, 26 Sep 2025 15:48:08 +0800 Subject: [PATCH 53/95] [metax] modify kernels (#67) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context * modify kernels --- .../fused_conv2d_add_act_kernel_register.cu | 0 .../fused_rope_grad_kernel_register.cu | 0 .../fused_rope_kernel_register.cu | 0 .../kernels/metax_kernel/metax_context.cc | 26 ------------------- .../kernels/metax_kernel/metax_context.h | 3 +-- 5 files changed, 1 insertion(+), 28 deletions(-) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_conv2d_add_act_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_rope_grad_kernel_register.cu (100%) rename backends/metax_gpu/kernels/{metax_kernel => fusion}/fused_rope_kernel_register.cu (100%) diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_conv2d_add_act_kernel_register.cu 
similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_conv2d_add_act_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc index efddba5f00b..0712fb75bbe 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -15,24 +15,6 @@ #include "kernels/metax_kernel/metax_context.h" namespace phi { -const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return true; -}(); - -const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; -}(); - -bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -42,19 +24,11 @@ void DnnWorkspaceHandle::RunFuncSync( void* workspace_ptr = nullptr; size_t size = ((required_workspace_bytes + 255) >> 8) << 8; std::lock_guard guard(*mtx_); -#ifdef PADDLE_WITH_HIP - auto status = hipMalloc(&workspace_ptr, size); -#else auto status = cudaMalloc(&workspace_ptr, size); -#endif if (status == gpuSuccess) { cudnn_func(workspace_ptr); phi::backends::gpu::GpuStreamSync(stream_); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipFree(workspace_ptr)); -#else PADDLE_ENFORCE_GPU_SUCCESS(cudaFree(workspace_ptr)); -#endif return; } } diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 2d761439089..7386811a236 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -18,6 +18,7 @@ #include #include "kernels/funcs/blas/cublasLt.h" +#include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" @@ -30,8 +31,6 @@ cublasLtHandle_t GetBlasLtHandle(); namespace phi { -bool AllowTF32Cublas(); -bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) From ac78af20874e28a7d5c3f1beed40762c716213bb Mon Sep 17 00:00:00 2001 From: Theendlessofhell <148317258+Theendlessofhell@users.noreply.github.com> Date: Fri, 26 Sep 2025 15:48:59 +0800 Subject: [PATCH 54/95] Fix part of the missing kernel issues (#66) Co-authored-by: root --- .../kernels/cuda_kernels/multinomial_kernel_register.cu | 3 ++- .../kernels/cuda_kernels/take_along_axis_kernel_register.cu | 5 ++++- 
.../metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu | 1 + .../kernels/metax_kernel/layer_norm_grad_kernel_register.cu | 1 + 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu index 622e70728f1..1325fa339b0 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/multinomial_kernel_register.cu @@ -21,6 +21,7 @@ PD_CUSTOM_KERNEL_REGISTER(multinomial, phi::MultinomialKernel, phi::dtype::float16, phi::dtype::bfloat16, - float) { + float, + double) { kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu index 4b23b0820fc..b628552aaaf 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/take_along_axis_kernel_register.cu @@ -25,4 +25,7 @@ PD_CUSTOM_KERNEL_REGISTER(take_along_axis, int64_t, int, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + uint8_t, // support uint8 + int16_t // support int16 +) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu index 287fa8de41a..ead21b1eb7e 100644 --- a/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu @@ -22,5 +22,6 @@ PD_REGISTER_PLUGIN_KERNEL(addmm, ALL_LAYOUT, phi::AddmmKernel, float, + double, phi::dtype::float16, phi::dtype::bfloat16) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu index 87c06dab2a4..857dcb6d522 100644 --- a/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu @@ -115,6 +115,7 @@ PD_REGISTER_PLUGIN_KERNEL(layer_norm_grad, ALL_LAYOUT, phi::LayerNormGradKernel, float, + double, phi::dtype::float16, phi::dtype::bfloat16) { if (kernel_key.dtype() == phi::DataType::FLOAT16) { From 404ff3de981a1d2f1d0b3fb36d6c6d41daea001f Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Fri, 26 Sep 2025 18:07:16 +0800 Subject: [PATCH 55/95] [Metax] fix index_elementwise_get kernel --- backends/metax_gpu/CMakeLists.txt | 2 +- .../index_elementwise_get_kernel_register.cu | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index bca1ce7aad4..3b74ae39c18 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,7 +326,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu diff
--git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu index 5ab3d2a3170..a45a740fc61 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/index_elementwise_get_kernel.h" +#include "paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu" // NOLINT PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, metax_gpu, @@ -27,7 +27,7 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, int64_t, int16_t, uint8_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} From 4ce9fe6de10402f04917cae8bd0f83bf499bdf1e Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:18:36 +0800 Subject: [PATCH 56/95] [Metax] fix index_elementwise_get kernel (#68) * [Metax] add keyword filter in CI CMakeLists.txt * [Metax] add ignore case list * [Metax] fix phi::backends::gpu::DnnVersion() symbol not found * Revert "[Metax] fix phi::backends::gpu::DnnVersion() symbol not found" This reverts commit 087a9c1240f024210d536e543a2fc55db1175529. * [Metax] fix index_elementwise_get kernel --- backends/metax_gpu/CMakeLists.txt | 2 +- .../index_elementwise_get_kernel_register.cu | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index bca1ce7aad4..3b74ae39c18 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,7 +326,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu index 5ab3d2a3170..a45a740fc61 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_get_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/index_elementwise_get_kernel.h" +#include "paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu" // NOLINT PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, metax_gpu, @@ -27,7 +27,7 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_get, int64_t, int16_t, uint8_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} From 3c8d0173075d49bef48a909a39f12d325e276f00 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 29 Sep 2025 10:42:05 +0800 Subject: [PATCH 57/95] [metax]fix patch and fix missing kernel (#72) * [metax]fix patch and fix missing kernel --- backends/metax_gpu/CMakeLists.txt | 3 + .../cuda_kernels/adam_kernel_selected_rows.cu | 41 ++++++++++++ .../cuda_kernels/einsum_kernel_register.cu | 16 ++--- .../lars_momentum_kernel_register.cu | 29 +++++++++ .../cuda_kernels/nonzero_kernel_register.cu | 8 ++- .../put_along_axis_kernel_register.cu | 6 +- backends/metax_gpu/patch/paddle.patch | 65 ------------------- 7 files changed, 90 insertions(+), 78 deletions(-) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 3b74ae39c18..5930eaaebd2 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -535,6 +535,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/clip_by_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/uniform_random_batch_size_like_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/get_tensor_from_selected_rows_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/batch_norm_kernel.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/empty_kernel.cc @@ -642,6 +643,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/lars_momentum_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_sum_kernel.cu # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps diff --git a/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu b/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu new file mode 100644 index 00000000000..df4105efbd2 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/adam_kernel_selected_rows.cu @@ -0,0 +1,41 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/selected_rows_functor.h" +#include "paddle/phi/kernels/selected_rows/adam_kernel.h" + +PD_CUSTOM_KERNEL_REGISTER(adam_dense_param_sparse_grad, + metax_gpu, + ALL_LAYOUT, + phi::sr::AdamDenseParamSparseGradKernel, + float, + double, + phi::float16) { + // Skip beta1_pow, beta2_pow, skip_update data transform + kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(7).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(9).SetBackend(phi::Backend::ALL_BACKEND); + + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(5).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(6).SetDataType(phi::DataType::FLOAT32); + } + kernel->OutputAt(4).SetBackend(phi::Backend::UNDEFINED); + kernel->OutputAt(5).SetBackend(phi::Backend::UNDEFINED); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu index 444928af78f..0f613b55e9e 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/einsum_kernel_register.cu @@ -23,10 +23,10 @@ PD_CUSTOM_KERNEL_REGISTER(einsum, phi::EinsumKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(einsum_infer, metax_gpu, @@ -34,7 +34,7 @@ PD_CUSTOM_KERNEL_REGISTER(einsum_infer, phi::EinsumInferKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu new file mode 100644 index 00000000000..5647c806bfd --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/lars_momentum_kernel_register.cu @@ -0,0 +1,29 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/lars_momentum_kernel.h" + +PD_CUSTOM_KERNEL_REGISTER(lars_momentum, + metax_gpu, + ALL_LAYOUT, + phi::LarsMomentumKernel, + float, + double, + phi::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu index 1f84b628e84..dc92b2c6d69 100755 --- a/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/nonzero_kernel_register.cu @@ -23,11 +23,13 @@ PD_CUSTOM_KERNEL_REGISTER(nonzero, int64_t, int, int16_t, - phi::dtype::float16, - phi::dtype::bfloat16, + phi::float16, + phi::bfloat16, bool, float, - double) { + double, + phi::complex64, + phi::complex128) { kernel->OutputAt(0).SetDataType(phi::DataType::INT64); } diff --git a/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu index 8ff1f5959ab..ca93a8ca079 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/put_along_axis_kernel_register.cu @@ -23,6 +23,8 @@ PD_CUSTOM_KERNEL_REGISTER(put_along_axis, float, double, int64_t, + uint8_t, + int16_t, int, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index beefb730bf7..4c06609338c 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -869,19 +869,6 @@ index e838778952..83e805e75a 100644 namespace phi { namespace fusion { -diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu -index 4c93778bde..c7bdf8a2cc 100644 ---- a/paddle/phi/kernels/gpu/correlation_kernel.cu -+++ b/paddle/phi/kernels/gpu/correlation_kernel.cu -@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, - int stride2, - int corr_type_multiply, - DenseTensor *out) { -- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; -+ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; - PADDLE_ENFORCE_EQ( - is_gpu_place, - true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -897,19 +884,6 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. 
-diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu -index c2ddfa1347..c6adf5a6de 100644 ---- a/paddle/phi/kernels/gpu/dgc_kernel.cu -+++ b/paddle/phi/kernels/gpu/dgc_kernel.cu -@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, - int buf_size = paddle::communication::dgc::get_buffer_size(k); - phi::Allocator::AllocationPtr tmp_ious_data; - #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - tmp_ious_data = phi::memory_utils::Alloc( - dev_ctx.GetPlace(), - buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -974,19 +948,6 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" -diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -index 05a977828f..5136608c41 100644 ---- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -+++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu -@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, - int64_t seed_int = 0; - if (seed.initialized()) { - const auto& seed_place = seed.place().GetType(); -- bool is_gpu_place = seed_place == phi::AllocationType::GPU; -+ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; - if (is_gpu_place) { - // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would - // not be CUDAPlace in practice. 
This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -1144,32 +1105,6 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" -diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h -index 7b85903776..3f4b298807 100644 ---- a/paddle/phi/kernels/impl/merged_momentum_impl.h -+++ b/paddle/phi/kernels/impl/merged_momentum_impl.h -@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( - params_out[idx], - velocities_out[idx]); - VLOG(10) << "Launch MergedMomentum cpu kernel."; -- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - phi::funcs::ForRange for_range( - static_cast(dev_ctx), params[idx]->numel()); - const auto grad_type = grads[idx]->dtype(); -diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h b/paddle/phi/kernels/impl/momentum_kernel_impl.h -index de5bcfc30b..eb2a9714f5 100644 ---- a/paddle/phi/kernels/impl/momentum_kernel_impl.h -+++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h -@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, - regularization_coeff, - param_out, - velocity_out); -- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { -+ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { - funcs::ForRange for_range(dev_ctx, param.numel()); - const auto grad_type = grad.dtype(); - #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h From 7303ae2c86253711559c2fe2f0abbc770541fe5e Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 29 Sep 2025 17:08:34 +0800 Subject: [PATCH 58/95] [metax] modify kernels (#73) * modify kernels --- .../kernels/impl/addmm_kernel_impl.h | 1 + backends/metax_gpu/patch/paddle.patch | 60 ++++++++++++++++++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h index fb1368b069c..b517b719d49 100644 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h @@ -98,6 +98,7 @@ void AddmmKernel(const Context& dev_ctx, y_dims[0])); dev_ctx.template Alloc(out); + if (out->numel() == 0) return; auto blas = funcs::GetBlas(dev_ctx); // calc broadcast dim diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 4c06609338c..69d714ef6e0 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -438,6 +438,21 @@ index d69eb67d6f..1d8b6e9375 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" +diff --git a/paddle/phi/kernels/funcs/embedding_grad.h b/paddle/phi/kernels/funcs/embedding_grad.h +index 461e6e2474..48a64ae9ce 100644 +--- a/paddle/phi/kernels/funcs/embedding_grad.h ++++ 
b/paddle/phi/kernels/funcs/embedding_grad.h +@@ -143,8 +143,8 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& dev_ctx, + constexpr int kWarpSize = 64; + constexpr int kBlockDimY = 16; + #else +- constexpr int kWarpSize = 32; +- constexpr int kBlockDimY = 32; ++ constexpr int kWarpSize = 64; ++ constexpr int kBlockDimY = 16; + #endif + dim3 threads(kWarpSize, kBlockDimY); + dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu @@ -501,6 +516,49 @@ index 15e1a4a3c3..e4780538d7 100644 #include "paddle/phi/kernels/funcs/im2col.h" namespace phi { +diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h +index e5361b836e..5ad238df08 100644 +--- a/paddle/phi/kernels/funcs/math_cuda_utils.h ++++ b/paddle/phi/kernels/funcs/math_cuda_utils.h +@@ -175,12 +175,12 @@ struct KeyValuePair { + #define WARP_SIZE_WIDTH_MASK 0x3f + typedef u_int64_t warp_mask_t; + #else +-#define FINAL_MASK 0xffffffff +-#define HALF_WARP 16 +-#define WARP_SIZE 32 +-#define WARP_SIZE_WIDTH 5 +-#define WARP_SIZE_WIDTH_MASK 0x1f +-typedef unsigned warp_mask_t; ++#define FINAL_MASK 0xffffffffffffffffUL ++#define HALF_WARP 32 ++#define WARP_SIZE 64 ++#define WARP_SIZE_WIDTH 6 ++#define WARP_SIZE_WIDTH_MASK 0x3f ++typedef u_int64_t warp_mask_t; + #endif + + template +@@ -200,19 +200,13 @@ __inline__ __device__ T BlockReduceSum(T val, warp_mask_t mask) { + static __shared__ T shared[WARP_SIZE]; + int lane = threadIdx.x & WARP_SIZE_WIDTH_MASK; + int wid = threadIdx.x >> WARP_SIZE_WIDTH; +- + val = WarpReduceSum(val, mask); +- +- __syncthreads(); + if (lane == 0) shared[wid] = val; +- + __syncthreads(); +- + // align block_span to warpSize + int block_span = (blockDim.x + warpSize - 1) >> WARP_SIZE_WIDTH; + val = (lane < block_span) ? 
shared[lane] : static_cast(0.0f); + val = WarpReduceSum(val, mask); +- + return val; + } + diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -534,7 +592,7 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 8b0baf5f5f..260482f124 100644 +index 047f52bd91..a05b34d3ba 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; From 8b184a32bd9e02c0d8b405d670a8e888a4522f42 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 29 Sep 2025 18:11:03 +0800 Subject: [PATCH 59/95] [metax] modify kernels (#74) * modify kernels --- .../gpudnn/conv_grad_kernel_register.cu | 37 ++++++++----------- .../kernels/gpudnn/conv_kernel_register.cu | 19 +++++----- .../kernels/gpudnn/conv_transpose_kernel.cu | 15 ++++---- .../depthwise_conv_grad_kernel.cu | 14 +++---- .../metax_kernel/depthwise_conv_kernel.cu | 14 +++---- 5 files changed, 45 insertions(+), 54 deletions(-) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index e4acb2f95b6..2da42c7ff8c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -437,26 +437,22 @@ void ConvCudnnGradKernel(const Context& dev_ctx, dev_ctx.template Alloc(filter_grad); } - // bool has_use_addto = dev_ctx.HasDnnAttr("use_addto"); - bool has_use_addto = "true"; + bool has_use_addto = dev_ctx.HasDnnAttr("use_addto"); VLOG(4) << "GPUContext contains `use_addto`: " << has_use_addto; - // bool use_addto = has_use_addto - // ? PADDLE_GET_CONST(bool, "true") - // : false; - bool use_addto = "true"; + bool use_addto = has_use_addto + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("use_addto")) + : false; std::vector dilations = dilations_t; std::vector strides = strides_t; std::vector paddings = paddings_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - bool has_exhaustive_search = "true"; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); VLOG(4) << "GPUContext contains `exhaustive_search`: " << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, "true") - // : false; - bool exhaustive_search_attr = "true"; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; @@ -835,14 +831,13 @@ void ConvCudnnGradGradKernel( T* transformed_dx = nullptr; std::vector dilations = dilations_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // VLOG(4) << "GPUContext contains `exhaustive_search`: " - // << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? 
PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - bool exhaustive_search_attr = "true"; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; diff --git a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index 0a83b504c76..d6b243c956c 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -228,15 +228,16 @@ void ConvCudnnKernel(const Context& dev_ctx, std::vector paddings = paddings_t; std::vector dilations = dilations_t; - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // VLOG(4) << "GPUContext contains `exhaustive_search`: " - // << has_exhaustive_search; - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - - bool exhaustive_search = FLAGS_cudnn_exhaustive_search; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + VLOG(4) << "GPUContext contains `exhaustive_search`: " + << has_exhaustive_search; + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index 532b7af0db4..4049d2f3130 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -260,14 +260,13 @@ void ConvTransposeRawGPUDNNKernel(const Context& dev_ctx, return; } - // bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); - // bool exhaustive_search_attr = - // has_exhaustive_search - // ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) - // : false; - // bool exhaustive_search = - // FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; - bool exhaustive_search = FLAGS_cudnn_exhaustive_search; + bool has_exhaustive_search = dev_ctx.HasDnnAttr("exhaustive_search"); + bool exhaustive_search_attr = + has_exhaustive_search + ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("exhaustive_search")) + : false; + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || exhaustive_search_attr; bool deterministic = FLAGS_cudnn_deterministic; PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, diff --git a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu index f2475298963..4e5f881385a 100644 --- a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu @@ -54,14 +54,12 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, return; } - // bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); - // bool fuse_relu = - // has_fuse_relu - // ? 
PADDLE_GET_CONST( - // bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) - // : false; - bool has_fuse_relu = false; - bool fuse_relu = false; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; std::vector strides = strides_t; std::vector paddings = paddings_t; diff --git a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu index 517f26b1c02..d3d6c4a4edd 100644 --- a/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu @@ -48,14 +48,12 @@ void DepthwiseConvKernel(const Context& dev_ctx, const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - // bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); - // bool fuse_relu = - // has_fuse_relu - // ? PADDLE_GET_CONST( - // bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) - // : false; - bool has_fuse_relu = false; - bool fuse_relu = false; + bool has_fuse_relu = dev_ctx.HasDnnAttr("fuse_relu_before_depthwise_conv"); + bool fuse_relu = + has_fuse_relu + ? PADDLE_GET_CONST( + bool, dev_ctx.GetDnnAttr("fuse_relu_before_depthwise_conv")) + : false; if (channel_last) { PADDLE_ENFORCE_EQ( From 60f0ed637f73305e8f0fbd03917e3c8e2978d1ef Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 11:33:54 +0800 Subject: [PATCH 60/95] [metax] link mccl and fix missing kernel (#76) * [metax] link mccl and fix missing kernel --- backends/metax_gpu/CMakeLists.txt | 7 + .../cross_entropy_bwd_w_downcast.cu | 291 ++++++++++++ .../embedding_grad_add_to_kernel.cu | 27 ++ .../cuda_kernels/gammaln_grad_kernel.cu | 28 ++ .../moe_combine_no_weight_grad_kernel.cu | 25 + .../cuda_kernels/multihead_matmul_kernel.cu | 433 ++++++++++++++++++ backends/metax_gpu/kernels/funcs/generator.cc | 287 ++++++++++++ .../kernels/impl/gammaln_grad_kernel_impl.h | 112 +++++ .../metax_kernel/cudnn_lstm_grad_kernel.cu | 362 +++++++++++++++ .../kernels/metax_kernel/cudnn_lstm_kernel.cu | 428 +++++++++++++++++ backends/metax_gpu/tests/ignore.txt | 4 + 11 files changed, 2004 insertions(+) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu create mode 100644 backends/metax_gpu/kernels/funcs/generator.cc create mode 100644 backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h create mode 100644 backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu create mode 100644 backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 5930eaaebd2..2bb282cf54f 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -326,6 +326,8 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/im2sequence_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/increment_kernel.cu + 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu + # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/cross_entropy_bwd_w_downcast.cu # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_get_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu @@ -728,6 +730,11 @@ target_link_libraries( ${WARPCTC_LIBRARIES} ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) + +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) +target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) + include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( diff --git a/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu b/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu new file mode 100644 index 00000000000..a0d5dfd7a5a --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/cross_entropy_bwd_w_downcast.cu @@ -0,0 +1,291 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/cross_entropy_grad_kernel.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "kernels/gpudnn/softmax_gpudnn.h" +#include "paddle/phi/backends/gpu/gpu_device_function.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/softmax.h" + +namespace phi { + +/* + Vectorized wrapper of softmax with cross entropy grad hard label. + Optimized with float4 vectorization for memory coalescing and improved + throughput. 
+*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabelVectorized( + LogitT* __restrict__ logits_grad, + const T* __restrict__ loss_grad, + const T* __restrict__ softmax, + const LabelT* __restrict__ labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + // Vectorized load/store with float4 for 128-bit memory transactions + constexpr int VEC_SIZE = 4; + using VecT = typename phi::AlignedVector; + using SoftmaxVecT = typename phi::AlignedVector; + + int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + int64_t vec_id = tid * VEC_SIZE; + + // Ensure we don't exceed bounds + if (vec_id >= n * dim * d) return; + + // Compute indices for vectorized access + int64_t idx_n = vec_id / (d * dim); + int64_t idx_dim_start = (vec_id / d) % dim; + int64_t idx_d = vec_id % d; + int64_t ids = idx_n * d + idx_d; + + // Load label once per thread + auto lbl = static_cast(labels[ids]); + + if (lbl == ignore_index) { + // Vectorized zero fill for ignore_index + VecT* vec_grad = reinterpret_cast(&logits_grad[vec_id]); + VecT zero_vec; +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + zero_vec.val[i] = static_cast(0.0f); + } + *vec_grad = zero_vec; + return; + } + + // Vectorized load of softmax values + SoftmaxVecT softmax_vec; + const SoftmaxVecT* softmax_ptr = + reinterpret_cast(&softmax[vec_id]); + softmax_vec = *softmax_ptr; + + // Load loss gradient (broadcast across vector elements) + T loss_grad_val = loss_grad[ids]; + + // Vectorized computation + VecT grad_vec; +#pragma unroll + for (int i = 0; i < VEC_SIZE; ++i) { + int64_t current_dim = idx_dim_start + i; + if (current_dim < dim) { // Bounds check for partial vectors + float softmax_val = static_cast(softmax_vec.val[i]); + float grad_val; + + if (lbl == current_dim) { + grad_val = (softmax_val - 1.0f) * static_cast(loss_grad_val); + } else { + grad_val = softmax_val * static_cast(loss_grad_val); + } + + grad_vec.val[i] = static_cast(grad_val); + } else { + grad_vec.val[i] = static_cast(0.0f); + } + } + + // Vectorized store + VecT* grad_ptr = reinterpret_cast(&logits_grad[vec_id]); + *grad_ptr = grad_vec; +} + +/* + Specialized kernel for dimensions not divisible by vector size + Uses warp-level primitives for better performance on irregular sizes +*/ +template +__global__ void SoftmaxWithCrossEntropyGradHardLabelWarp( + LogitT* __restrict__ logits_grad, + const T* __restrict__ loss_grad, + const T* __restrict__ softmax, + const LabelT* __restrict__ labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + const int warps_per_block = 4; + const int threads_per_warp = 32; + const int threads_per_block = warps_per_block * threads_per_warp; + + int tid = blockIdx.x * threads_per_block + threadIdx.x; + int warp_id = threadIdx.x / threads_per_warp; + int lane_id = threadIdx.x % threads_per_warp; + + // Process multiple elements per thread using warp-level parallelism + int64_t elements_per_thread = + (n * dim * d + gridDim.x * threads_per_block - 1) / + (gridDim.x * threads_per_block); + + for (int e = 0; e < elements_per_thread; ++e) { + int64_t idx = tid + e * gridDim.x * threads_per_block; + if (idx >= n * dim * d) break; + + int64_t idx_n = idx / (d * dim); + int64_t idx_dim = (idx / d) % dim; + int64_t idx_d = idx % d; + int64_t ids = idx_n * d + idx_d; + + auto lbl = static_cast(labels[ids]); + + if (lbl == ignore_index) { + logits_grad[idx] = static_cast(0.0f); + } else if (lbl == idx_dim) { + logits_grad[idx] = + 
static_cast((static_cast(softmax[idx]) - 1.0f) * + static_cast(loss_grad[ids])); + } else { + logits_grad[idx] = + static_cast(static_cast(softmax[idx]) * + static_cast(loss_grad[ids])); + } + } +} + +/* + Optimized kernel selector based on problem size and alignment +*/ +template +void LaunchOptimizedCrossEntropyGradKernel(const GPUContext& dev_ctx, + LogitT* logits_grad, + const T* loss_grad, + const T* softmax, + const LabelT* labels, + const int64_t n, + const int64_t dim, + const int64_t d, + const int ignore_index) { + const int64_t total_elements = n * dim * d; + auto stream = dev_ctx.stream(); + + // Check alignment for vectorized kernel + bool is_aligned = (reinterpret_cast(logits_grad) % 16 == 0) && + (reinterpret_cast(softmax) % 16 == 0) && + (total_elements % 4 == 0); + + if (is_aligned && total_elements >= 1024) { + // Use vectorized kernel for aligned, large problems + constexpr int VEC_SIZE = 4; + const int threads_per_block = 256; + const int vec_elements = total_elements / VEC_SIZE; + const int blocks = + (vec_elements + threads_per_block - 1) / threads_per_block; + + SoftmaxWithCrossEntropyGradHardLabelVectorized + <<>>( + logits_grad, loss_grad, softmax, labels, n, dim, d, ignore_index); + } else { + // Use warp-specialized kernel for irregular sizes + const int warps_per_block = 4; + const int threads_per_block = warps_per_block * 32; + const int blocks = + std::min(1024, + static_cast((total_elements + threads_per_block - 1) / + threads_per_block)); + + SoftmaxWithCrossEntropyGradHardLabelWarp + <<>>( + logits_grad, loss_grad, softmax, labels, n, dim, d, ignore_index); + } +} + +template +void CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel( + const GPUContext& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + int axis, + DenseTensor* logits_grad) { + // PADDLE_ENFORCE_EQ( + // dev_ctx.GetPlace().GetType(), + // phi::AllocationType::GPU, + // common::errors::Unavailable("softmax_with_cross_entropy operator's " + // "CUDA kernel only runs on GPU device.")); + + using LogitT = phi::bfloat16; + const T* loss_grad_data = loss_grad.data(); + DenseTensor* logit_grad = logits_grad; + + LogitT* logit_grad_data = nullptr; + logit_grad_data = dev_ctx.template Alloc(logit_grad); + + const int rank = logit_grad->dims().size(); + const int axis_v = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = logit_grad->dims()[axis_v]; + + const int64_t n = phi::funcs::SizeToAxis(axis_v, logit_grad->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis_v, logit_grad->dims()); + const int64_t remain = d / axis_dim; + + const T* softmax_data = softmax.data(); + const auto* label_data = label.data(); + + // Launch optimized kernel with automatic selection + LaunchOptimizedCrossEntropyGradKernel(dev_ctx, + logit_grad_data, + loss_grad_data, + softmax_data, + label_data, + n, + axis_dim, + remain, + -100); +} + +template +void CrossEntropyWithSoftmaxBwdWithDowncastKernel(const Context& dev_ctx, + const DenseTensor& label, + const DenseTensor& softmax, + const DenseTensor& loss_grad, + DenseTensor* logits_grad) { + constexpr int axis = -1; + if (logits_grad->numel() == 0) { + dev_ctx.template Alloc(logits_grad); + return; + } + auto dtype = label.dtype(); + PD_VISIT_INTEGRAL_TYPES( + dtype, "CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel", ([&] { + CrossEntropyWithSoftmaxBwdWithDowncastGPUKernel( + dev_ctx, label, softmax, loss_grad, axis, logits_grad); + })); +} + +} // namespace phi + 
+PD_REGISTER_PLUGIN_KERNEL(cross_entropy_with_softmax_bwd_w_downcast, + metax_gpu, + ALL_LAYOUT, + phi::CrossEntropyWithSoftmaxBwdWithDowncastKernel, + float, + double, + phi::float16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu new file mode 100644 index 00000000000..6b20feee0fd --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/embedding_grad_add_to_kernel.cu @@ -0,0 +1,27 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/embedding_grad_kernel.h" +#include "paddle/phi/kernels/funcs/embedding_grad.h" +#include "paddle/phi/kernels/gpu/embedding_grad_add_to_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(embedding_grad_add_to, + metax_gpu, + ALL_LAYOUT, + phi::EmbeddingGradAddToAddToKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu new file mode 100644 index 00000000000..c6bd53f007f --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "kernels/impl/gammaln_grad_kernel_impl.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gammaln_grad_kernel.h" + +PD_REGISTER_PLUGIN_KERNEL(gammaln_grad, + metax_gpu, + ALL_LAYOUT, + phi::GammalnGradKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu new file mode 100644 index 00000000000..e6984cf86d2 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/moe_combine_no_weight_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/legacy/gpu/moe_combine_no_weight_grad_kernel.cu" // NOLINT + +PD_CUSTOM_KERNEL_REGISTER(moe_combine_no_weight_grad, + metax_gpu, + ALL_LAYOUT, + phi::MoeCombineNoWeightGradKernel, + float, + double, + phi::bfloat16, + phi::float16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu new file mode 100644 index 00000000000..151c929e41c --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu @@ -0,0 +1,433 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "kernels/funcs/blas/blas.h" +#include "paddle/common/errors.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" + +namespace phi { +namespace fusion { + +template +__global__ void transpose(T *src, + T *dst, + const int batch_size, + const int seq_len, + const int head_num, + const int size_per_head) { + int batch_id = blockIdx.x / (head_num * seq_len); + int seq_id = blockIdx.x % seq_len; + int head_id = (blockIdx.x % (head_num * seq_len)) / seq_len; + dst[batch_id * (head_num * seq_len * size_per_head) + + seq_id * head_num * size_per_head + head_id * size_per_head + + threadIdx.x] = src[blockIdx.x * size_per_head + threadIdx.x]; +} + +template +inline __device__ T add_func(T a, T b); + +template <> +__device__ float add_func(float a, float b) { + return a + b; +} + +template <> +__device__ float2 add_func(float2 a, float2 b) { + float2 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + return c; +} + +template <> +__device__ float4 add_func(float4 a, float4 b) { + float4 c; + c.x = a.x + b.x; + c.y = a.y + b.y; + c.z = a.z + b.z; + c.w = a.w + b.w; + return c; +} +#if defined(PADDLE_WITH_CUDA) +template <> +__device__ half2 add_func(half2 a, half2 b) { +#if __CUDA_ARCH__ >= 530 + return __hadd2(a, b); +#else + return half2(__float2half(__half2float(a.x) + __half2float(b.x)), + __float2half(__half2float(b.x) + __half2float(b.y))); +#endif +} + +template <> +__device__ half add_func(half a, half b) { +#if __CUDA_ARCH__ >= 530 + return __hadd(a, b); +#else + return __float2half(__half2float(a) + __half2float(b)); +#endif +} +#endif + +template +__global__ void TransposeQkvKernel(const int H, + const T *input, + const T *bias, + T *output) { + // Input: BxSx3xNxH + // Bias: 3xNxH + 
// Output: 3xBxNxSxH + int n = threadIdx.y; + int s = blockIdx.x; + int b = blockIdx.y; + int m = blockIdx.z; + + const int N = blockDim.y; + const int S = gridDim.x; + const int B = gridDim.y; + + const int NH = N * H; + const int NHS = NH * S; + const int in_offset = n * H + m * NH + s * 3 * NH + b * NHS * 3; + const int bias_offset = m * NH + n * H; + const int out_offset = s * H + n * S * H + b * NHS + m * NHS * B; + + const int i = threadIdx.x; + output[out_offset + i] = + add_func(input[in_offset + i], bias[bias_offset + i]); +} + +template +void TransQKVWithBias(const int batch, + const int seq_len, + const int head_size, + const int head_num, + const T *input, + const T *bias, + T *output, + gpuStream_t stream); + +template <> +void TransQKVWithBias(const int batch, + const int seq_len, + const int head_size, + const int head_num, + const float *input, + const float *bias, + float *output, + gpuStream_t stream) { + // BxSx3xNxH + 3xNxH -> 3xBxNxSxH + int scratch_size = batch * head_num * seq_len * seq_len; + const dim3 grid(seq_len, batch, 3); + // scratch % 4 == 0 to ensure the alignment + if (head_size % 4 == 0 && scratch_size % 4 == 0) { + const int h = head_size / 4; + const float4 *input4 = reinterpret_cast(input); + const float4 *bias4 = reinterpret_cast(bias); + float4 *output4 = reinterpret_cast(output); + const dim3 block(h, head_num, 1); + + // limit h * head_num to max block size(1024). + PADDLE_ENFORCE_LE(h * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024 * 4)); + TransposeQkvKernel + <<>>(h, input4, bias4, output4); + } else if (head_size % 2 == 0 && scratch_size % 2 == 0) { + const int h = head_size / 2; + const float2 *input2 = reinterpret_cast(input); + const float2 *bias2 = reinterpret_cast(bias); + float2 *output2 = reinterpret_cast(output); + const dim3 block(h, head_num, 1); + // limit h * head_num to max block size(1024). + PADDLE_ENFORCE_LE(h * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024 * 2)); + TransposeQkvKernel + <<>>(h, input2, bias2, output2); + } else { + const dim3 block(head_size, head_num, 1); + // limit head_size * head_num to max block size(1024). + PADDLE_ENFORCE_LE(head_size * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024)); + TransposeQkvKernel + <<>>(head_size, input, bias, output); + } +} + +#if defined(PADDLE_WITH_CUDA) +template <> +void TransQKVWithBias(const int batch, + const int seq_len, + const int head_size, + const int head_num, + const phi::float16 *input, + const phi::float16 *bias, + phi::float16 *output, + gpuStream_t stream) { + // BxSx3xNxH + 3xNxH -> 3xBxNxSxH + int scratch_size = batch * head_num * seq_len * seq_len; + const dim3 grid(seq_len, batch, 3); + if (head_size % 2 == 0 && scratch_size % 2 == 0) { + const int h = head_size / 2; + const half2 *input2 = reinterpret_cast(input); + const half2 *bias2 = reinterpret_cast(bias); + half2 *output2 = reinterpret_cast(output); + const dim3 block(h, head_num, 1); + // limit h * head_num to max block size(1024). 
+ PADDLE_ENFORCE_LE(h * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024 * 2)); + TransposeQkvKernel + <<>>(h, input2, bias2, output2); + } else { + const dim3 block(head_size, head_num, 1); + const half *input_half = reinterpret_cast(input); + const half *bias_half = reinterpret_cast(bias); + half *output_half = reinterpret_cast(output); + + // limit head_size * head_num to max block size(1024). + PADDLE_ENFORCE_LE(head_size * head_num, + 1024, + common::errors::InvalidArgument( + "head_num (%d) * head_size (%d) should <= %d", + head_num, + head_size, + 1024)); + TransposeQkvKernel<<>>( + head_size, input_half, bias_half, output_half); + } +} +#endif + +inline int round_up(int seq_len, int multiple = 32) { + PADDLE_ENFORCE_GT( + multiple, + 0, + common::errors::InvalidArgument( + "multiple should be a positive number, but it's (%d)", multiple)); + return ((seq_len + multiple - 1) / multiple) * multiple; +} + +template +__global__ void broadcast(const T *src, + T *dst, + const int seq_len, + const int head_num) { + int batch_id = blockIdx.x / (head_num * seq_len); + int dst_offset = blockIdx.x * seq_len; + if (threadIdx.x < seq_len) { + dst[threadIdx.x + dst_offset] = src[threadIdx.x + batch_id * seq_len]; + } +} + +template +__global__ void broadcast_batch_head_number(const T *src, + T *dst, + const int batch_size, + const int seq_len, + const int head_num) { + int src_seq_id = blockIdx.x % seq_len; + int dst_offset = blockIdx.x * seq_len; + if (threadIdx.x < seq_len) { + dst[threadIdx.x + dst_offset] = src[threadIdx.x + src_seq_id * seq_len]; + } +} + +template +void MultiheadMatmulKernel(const Context &dev_ctx, + const DenseTensor &input, + const DenseTensor &w, + const DenseTensor &bias, + const paddle::optional &bias_qk, + const bool transpose_q, + const bool transpose_k, + const bool transpose_v, + const float alpha, + const int head_number, + DenseTensor *out) { + auto *input_d = input.data(); + auto *w_d = w.data(); + auto *bias_d = bias.data(); + auto *bias_qk_d = bias_qk ? 
bias_qk->data() : nullptr; + T scale = static_cast(alpha); + + // compute q*k with eltadd + auto stream = dev_ctx.stream(); + // should be (B * S * hidden) + auto input_dims = input.dims(); + // shouble be (hidden * 3 * all_head_size) + auto w_dims = w.dims(); + int batch = input_dims[0]; + int seq_len = input_dims[1]; + int hidden = input_dims[2]; + phi::DenseTensor temp_bias_tensor; + // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted + if (bias_qk && bias_qk->numel() == (batch * seq_len)) { + VLOG(4) << "Do broadcasted bias_qk from [batch, 1, 1, seq_len]"; + temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); + auto *temp_qk_bias = dev_ctx.template Alloc( + &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); + int grid = batch * head_number * seq_len; + int block = round_up(seq_len); + broadcast<<>>( + bias_qk_d, temp_qk_bias, seq_len, head_number); + bias_qk_d = static_cast(temp_qk_bias); + } + // if bias_qk is[1, 1, seq_len, seq_len], the bias_qk_d need to be + // broadcasted + if (bias_qk && bias_qk->numel() == (1 * seq_len * seq_len)) { + VLOG(4) << "do broadcasted bias_qk from [1, 1, seq_len, seq_len]"; + temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len}); + auto *temp_qk_bias = dev_ctx.template Alloc( + &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); + int grid = batch * head_number * seq_len; + int block = round_up(seq_len); + broadcast_batch_head_number<<>>( + bias_qk_d, temp_qk_bias, batch, seq_len, head_number); + bias_qk_d = static_cast(temp_qk_bias); + } + if (!bias_qk) { + int size = batch * head_number * seq_len * seq_len; + temp_bias_tensor.Resize({size}); + auto *temp_qk_bias = dev_ctx.template Alloc( + &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T)); +#ifdef PADDLE_WITH_HIP + hipMemset(temp_qk_bias, 0, sizeof(float) * size); +#else + cudaMemset(temp_qk_bias, 0, sizeof(float) * size); +#endif + bias_qk_d = static_cast(temp_qk_bias); + } + int all_head_size = w_dims[2]; + int head_size = all_head_size / head_number; + + out->Resize({batch, seq_len, all_head_size}); + auto *output_d = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); + + // (B*S, hidden) + const phi::DenseTensor input_matrix = + phi::ReshapeToMatrix(input, 2 /*x_num_col_dims */); + // (hidden, 3 * all_head_size) + const phi::DenseTensor w_matrix = + phi::ReshapeToMatrix(w, 1 /*y_num_col_dims*/); + + phi::DenseTensor temp_out_tensor; + auto temp_out_dims = + common::make_ddim({batch, seq_len, 3, head_number, head_size}); + temp_out_tensor.Resize( + {batch * seq_len, common::product(temp_out_dims) / (batch * seq_len)}); + auto *temp_out_data = dev_ctx.template Alloc( + &temp_out_tensor, temp_out_tensor.numel() * sizeof(T)); + + // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H) + auto blas = phi::funcs::GetBlas(dev_ctx); + blas.MatMul(input_matrix, w_matrix, &temp_out_tensor); + VLOG(2) << "(B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)"; + // temp_out_tensor.Resize(temp_out_dims); + + phi::DenseTensor multihead_temp_tensor; + // B * head_number * S * S * 1 + B * S * 3 * N * H + int scratch_size = batch * head_number * seq_len * seq_len * 1; + multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()}); + auto *multihead_temp_data = dev_ctx.template Alloc( + &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T)); + + auto *qkptr = multihead_temp_data; + auto *tptr = multihead_temp_data + scratch_size; + + // Do the transpose with bias. + // BxSx3xNxH => tptr: 3xBxNxSxH. 
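+  // qkptr is the B*N*S*S attention-score scratch consumed by
+  // MultiheadGPUComputeFunctor below; tptr receives the transposed QKV.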
+ TransQKVWithBias(batch, + seq_len, + head_size, + head_number, + temp_out_data, + bias_d, + tptr, + stream); + if (std::is_same::value) { + phi::funcs::MultiheadGPUComputeFunctor multihead_compute_func; + multihead_compute_func(dev_ctx, + batch, + seq_len, + head_number, + head_size, + reinterpret_cast(qkptr), + reinterpret_cast(bias_qk_d), + false, + reinterpret_cast(tptr), + __float2half(static_cast(scale)), + __float2half(0.0)); + } else { + phi::funcs::MultiheadGPUComputeFunctor multihead_compute_func; + multihead_compute_func(dev_ctx, + batch, + seq_len, + head_number, + head_size, + qkptr, + bias_qk_d, + false, + tptr, + scale, + T(0.0)); + } + + int grid = batch * head_number * seq_len; + int block = head_size; + transpose<<>>( + tptr, output_d, batch, seq_len, head_number, head_size); +} + +} // namespace fusion +} // namespace phi + +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_PLUGIN_KERNEL(multihead_matmul, + metax_gpu, + ALL_LAYOUT, + phi::fusion::MultiheadMatmulKernel, + float, + phi::float16) {} +#else +PD_REGISTER_PLUGIN_KERNEL(multihead_matmul, + metax_gpu, + ALL_LAYOUT, + phi::fusion::MultiheadMatmulKernel, + float) {} +#endif diff --git a/backends/metax_gpu/kernels/funcs/generator.cc b/backends/metax_gpu/kernels/funcs/generator.cc new file mode 100644 index 00000000000..8fcbf474b07 --- /dev/null +++ b/backends/metax_gpu/kernels/funcs/generator.cc @@ -0,0 +1,287 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/generator.h" + +#include + +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/xpu/xpu_info.h" +#include "paddle/phi/core/enforce.h" + +static uint64_t GetRandomSeed() { + std::random_device rd; + // double has 53 bit significant, so limit uint64 to 53 bits + return ((((uint64_t)rd()) << 32) + rd()) & 0x1FFFFFFFFFFFFF; +} + +namespace phi { + +const std::shared_ptr& DefaultXPUGenerator(int64_t device_id) { +#if defined(PADDLE_WITH_XPU) + + static int64_t num_xpu_devices = -1; + static std::once_flag num_devices_init_flag; + static std::deque xpu_device_flags; + static std::vector> default_xpu_generators; + + std::call_once(num_devices_init_flag, []() { + num_xpu_devices = phi::backends::xpu::GetXPUDeviceCount(); + xpu_device_flags.resize(num_xpu_devices); + default_xpu_generators.resize(num_xpu_devices); + }); + if (device_id < 0) { + PADDLE_THROW(common::errors::InvalidArgument( + "xpu device id should be greater than 0")); + } + + std::call_once(xpu_device_flags[device_id], [device_id]() { + default_xpu_generators[device_id] = + std::make_shared(GetRandomSeed(), device_id); + VLOG(4) << "initial seed: " + << default_xpu_generators[device_id]->GetCurrentSeed(); + }); + return default_xpu_generators[device_id]; +#else + PADDLE_THROW(common::errors::PermissionDenied( + "getDefaultXPUGenerator only support in XPU place")); +#endif +} + +const std::shared_ptr& DefaultCUDAGenerator(int64_t device_id) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + static int64_t num_cuda_devices = -1; + static std::once_flag num_devices_init_flag; + static std::deque cuda_device_flags; + static std::vector> default_cuda_generators; + + std::call_once(num_devices_init_flag, []() { + num_cuda_devices = phi::backends::gpu::GetGPUDeviceCount(); + cuda_device_flags.resize(num_cuda_devices); + default_cuda_generators.resize(num_cuda_devices); + }); + if (device_id < 0) { + PADDLE_THROW(common::errors::InvalidArgument( + "cuda device id should be greater than 0")); + } + + std::call_once(cuda_device_flags[device_id], [device_id]() { + default_cuda_generators[device_id] = + std::make_shared(GetRandomSeed(), device_id); + VLOG(7) << "initial seed: " + << default_cuda_generators[device_id]->GetCurrentSeed(); + }); + return default_cuda_generators[device_id]; +#else + PADDLE_THROW(common::errors::PermissionDenied( + "getDefaultCUDAGenerator only support in CUDA place")); +#endif +} + +const std::shared_ptr& DefaultCPUGenerator() { + static auto default_cpu_generator = + std::make_shared(GetRandomSeed()); + return default_cpu_generator; +} + +const std::shared_ptr& DefaultCustomDeviceGenerator( + const phi::CustomPlace& place) { + static std:: + unordered_map, phi::Place::Hash> + generators; + if (generators.find(place) == generators.end()) { + generators.insert({place, std::make_shared(GetRandomSeed())}); + } + return generators[place]; +} + +using RNGMap = std::unordered_map>; + +static RNGMap& GetRandomSeedGeneratorMap() { + static auto random_seed_generator_map = RNGMap(); + return random_seed_generator_map; +} + +const std::shared_ptr& SetRandomSeedGenerator( + const std::string& name, uint64_t seed) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter == rng_map.end(), + true, + common::errors::AlreadyExists( + "%s RandomSeedGenerator is already exist", name)); + + auto generator = std::make_shared(seed); + bool emplace_success = rng_map.emplace(name, 
generator).second; + PADDLE_ENFORCE_EQ( + emplace_success, + true, + common::errors::PermissionDenied( + "SetRandomSeedGenerator cannot emplace %s RandomSeedGenerator", + name)); + return rng_map[name]; +} + +const std::shared_ptr& GetRandomSeedGenerator( + const std::string& name) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter != rng_map.end(), + true, + common::errors::NotFound( + "%s RandomSeedGenerator is not found, please " + "use `set_random_seed_generator` to set rng first", + name)); + return iter->second; +} + +// There are 3 conditions: +// (1) op seed is set, use op seed. +// (2) op seed is not set, global seed is set, use global seed. +// (3) op seed is not set, global seed is not set too, use random seed from +// RandomGenerator. +std::shared_ptr GetCPURandomEngine(uint64_t seed) { + if (seed == 0) { + VLOG(4) << "Use random cpu_engine from generator"; + return DefaultCPUGenerator()->GetCPUEngine(); + } else { + // NOTE(zhiqiu): creating an cpu_engine instance everytime instead of using + // OpDefaultCPUEngine(), this is the legacy behavior of random operators. + // The benefit is that when running PE with fixed-seed in multiple threads, + // each thread has their own cpu_engine, and doesn't affect each other. + // + // And we need to measure the determinacy of Generator in PE. + auto cpu_engine = std::make_shared(); + static std::mutex mu_; + { + std::lock_guard lock(mu_); + cpu_engine->seed(seed); + } + return cpu_engine; + } +} + +inline void Generator::print_state_info() { + VLOG(7) << "Generator Random state " + << "device id: " << state().device << ", seed: " << state().seed + << ", offset: " << state().offset << ", cpu_engine: " << cpu_engine(); +} + +Generator::Generator() { + auto seed = GetRandomSeed(); + current_index = states_.size(); + states_.emplace_back(-1, seed); + print_state_info(); +} + +Generator::Generator(uint64_t seed) { + current_index = states_.size(); + states_.emplace_back(-1, seed); + print_state_info(); +} + +Generator::Generator(uint64_t seed, int64_t device_id) { + current_index = states_.size(); + // device id first, then seed + states_.emplace_back(device_id, seed); + print_state_info(); +} + +phi::Generator::GeneratorState Generator::GetState() { return state(); } + +void Generator::SetState(const phi::Generator::GeneratorState& state) { + std::lock_guard lock(mu_); + if (current_index < states_.size()) + states_[current_index] = state; + else + PADDLE_THROW(common::errors::NotFound("Generator index is not found")); + print_state_info(); +} + +uint64_t Generator::GetStateIndex() { return current_index; } + +void Generator::SetStateIndex(uint64_t StateIndex) { + std::lock_guard lock(mu_); + if (current_index < states_.size()) + current_index = StateIndex; + else + PADDLE_THROW(common::errors::NotFound("Generator index is not found")); +} + +uint64_t Generator::RegisterStateIndex(const GeneratorState& state) { + std::lock_guard lock(mu_); + auto new_index = states_.size(); + states_.push_back(state); + current_index = new_index; + return new_index; +} + +inline Generator::GeneratorState& Generator::state() { + if (current_index < states_.size()) + return states_[current_index]; + else + PADDLE_THROW(common::errors::NotFound("Generator index is not found")); +} + +inline std::shared_ptr Generator::cpu_engine() { + return state().cpu_engine; +} + +uint64_t Generator::GetCurrentSeed() { + std::lock_guard lock(mu_); + return state().seed; +} + +uint64_t Generator::Seed() { + 
std::lock_guard lock(mu_); + uint64_t seed = GetRandomSeed(); + state().reset(seed); + return seed; +} + +void Generator::SetCurrentSeed(uint64_t seed) { + std::lock_guard lock(mu_); + state().reset(seed); +} + +std::shared_ptr Generator::GetCPUEngine() { + return cpu_engine(); +} + +uint64_t Generator::Random64() { + std::lock_guard lock(mu_); + auto current_engine = cpu_engine(); + return (*current_engine)(); +} + +std::pair Generator::IncrementOffset(uint64_t increment) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_CUSTOM_DEVICE) || defined(PADDLE_WITH_XPU) + std::lock_guard lock(mu_); + uint64_t offset = state().offset; + state().offset = offset + increment; + print_state_info(); + return std::make_pair(state().seed, offset); +#else + PADDLE_THROW(common::errors::PermissionDenied( + "Increment Offset only support in CUDA place")); +#endif +} + +} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h new file mode 100644 index 00000000000..2b222ba3b2c --- /dev/null +++ b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h @@ -0,0 +1,112 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/amp_type_traits.h" +#include "paddle/phi/kernels/funcs/for_range.h" + +namespace phi { +template +HOSTDEVICE T digamma_positive_domain(T x) { + constexpr T c = T{8.5}; + constexpr T euler_mascheroni = T{0.57721566490153286060}; + T r; + T value; + T x2; + + if (x <= T{0.000001}) { + value = -euler_mascheroni - T{1.0} / x + T{1.6449340668482264365} * x; + return value; + } + + value = T{0.0}; + x2 = x; + while (x2 < c) { + value = value - T{1.0} / x2; // NOLINT + x2 = x2 + T{1.0}; + } + + r = T{1.0} / x2; + value = value + std::log(x2) - T{0.5} * r; + + r = r * r; + + value = value - + r * (T{1.0} / T{12.0} - + r * (T{1.0} / T{120.0} - + r * (T{1.0} / T{252.0} - + r * (T{1.0} / T{240.0} - r * (T{1.0} / T{132.0}))))); + + return value; +} + +template +HOSTDEVICE T digamma(T x) { + const static T pi = T{3.14159265358979323846}; // NOLINT + + if (x == T{0.0}) { + T inf = std::numeric_limits::infinity(); + return std::signbit(x) ? 
inf : -inf; + } else if (x < T{0.0}) { + if (x == std::trunc(x)) { + return std::numeric_limits::quiet_NaN(); + } else { + T iptr; + T frac_part = std::modf(x, &iptr); + return digamma_positive_domain(T{1.0} - x) - + pi / std::tan(pi * frac_part); + } + } else { + return digamma_positive_domain(x); + } +} + +template +struct GammalnGradFunctor { + GammalnGradFunctor(const T* dout, const T* x, T* output, int64_t numel) + : dout_(dout), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + using MT = typename phi::dtype::MPTypeTrait::Type; + const MT mp_dout = static_cast(dout_[idx]); + const MT mp_x = static_cast(x_[idx]); + output_[idx] = static_cast(mp_dout * digamma(mp_x)); + } + + private: + const T* dout_; + const T* x_; + T* output_; + int64_t numel_; +}; +template +void GammalnGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& d_out, + DenseTensor* d_x) { + auto numel = d_out.numel(); + if (d_x && d_x->numel() == 0) { + dev_ctx.template Alloc(d_x); + return; + } + auto* dout_data = d_out.data(); + auto* x_data = x.data(); + auto* dx_data = + dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); + phi::funcs::ForRange for_range(dev_ctx, numel); + GammalnGradFunctor functor(dout_data, x_data, dx_data, numel); + for_range(functor); +} +} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu new file mode 100644 index 00000000000..766d984a25b --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_grad_kernel.cu @@ -0,0 +1,362 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
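+// LSTM backward via cuDNN/MIOpen: consumes the reserve space produced by the
+// forward kernel and writes x/init_h/init_c gradients plus a flattened weight
+// gradient that is scattered back into weight_grad_list.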
+ +#include "kernels/metax_kernel/metax_context.h" //NOLINT +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cudnn_lstm_grad_kernel.h" +#include "paddle/phi/kernels/gpu/cudnn_lstm_utils.h" + +namespace phi { + +template +void CudnnLSTMGradKernel( + const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &init_h, + const DenseTensor &init_c, + const paddle::optional> &weight_list, + const paddle::optional &sequence_length, + const DenseTensor &out, + const DenseTensor &reserve, + const DenseTensor &state_out, + const DenseTensor &out_grad, + const DenseTensor &last_h_grad, + const DenseTensor &last_c_grad, + float dropout_prob, + bool is_bidirec, + int hidden_size, + int num_layers, + bool is_test, + int seed, + DenseTensor *x_grad, + DenseTensor *init_h_grad, + DenseTensor *init_c_grad, + std::vector weight_grad_list) { + auto input_dims = x.dims(); + auto init_h_dims = init_h.dims(); + auto init_c_dims = init_c.dims(); + + auto *init_h_data = init_h.data(); + auto *init_c_data = init_c.data(); + auto *out_data = out.data(); + auto *out_grad_data = out_grad.data(); + auto *last_h_grad_data = last_h_grad.data(); + auto *last_c_grad_data = last_c_grad.data(); + + auto running_weight_list = *weight_list.get_ptr(); + int weight_numel = size_sum(running_weight_list); + bool continuous = is_continuous>( + running_weight_list); + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + phi::DenseTensor weight_whole; + T *weight_data = nullptr; + + if (!continuous) { + weight_whole.Resize({weight_numel}); + dev_ctx.template Alloc(&weight_whole); + weight_to_tensor(place, stream, running_weight_list, &weight_whole); + weight_data = weight_whole.data(); + } else { + weight_data = const_cast(running_weight_list[0]->data()); + } + + phi::DenseTensor weight_grad; + phi::funcs::SetConstant zero; + weight_grad.Resize({weight_numel}); + dev_ctx.template Alloc(&weight_grad); + zero(dev_ctx, &weight_grad, static_cast(0.0)); + T *weight_grad_data = weight_grad.data(); + + int offset = 0; + for (size_t i = 0; i < weight_grad_list.size(); ++i) { + size_t len = weight_grad_list[i]->numel(); + auto dim = weight_grad_list[i]->dims(); + weight_grad_list[i] + ->ShareDataWith(weight_grad.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + + x_grad->Resize(input_dims); + dev_ctx.template Alloc(x_grad); + auto *in_grad_data = x_grad->data(); + + if (init_h_grad) { + init_h_grad->Resize(init_h_dims); + dev_ctx.template Alloc(init_h_grad); + } + auto *init_h_grad_data = init_h_grad ? init_h_grad->data() : nullptr; + + if (init_c_grad) { + init_c_grad->Resize(init_c_dims); + dev_ctx.template Alloc(init_c_grad); + } + auto *init_c_grad_data = init_c_grad ? 
init_c_grad->data() : nullptr; + + auto running_seq_length = sequence_length.get_ptr(); + bool has_seq_length = running_seq_length != nullptr; + std::vector SequenceLength; + if (has_seq_length) { + SequenceLength = phi::GetVectorFromTensor(running_seq_length); + } + + int seq_length = input_dims[0]; + int batch_size = x.dims()[1]; + int input_size = x.dims()[2]; + + size_t workspace_size; + size_t reserve_size; + + ScopedRNNBase rnn(seq_length, + batch_size, + input_size, + hidden_size, + num_layers, + dropout_prob, + seed, + weight_numel, + true, + is_bidirec); + + rnn.Create(handle, + dev_ctx.GetPlace(), + SequenceLength, + &workspace_size, + &reserve_size, + const_cast(&state_out)); + + phi::DenseTensor workspace_data_; + workspace_data_.Resize({static_cast(workspace_size)}); + dev_ctx.template Alloc(&workspace_data_); + const uint8_t *reserve_data = reserve.data(); + +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardData_v8( + handle, + rnn.rnn_desc(), + nullptr, + rnn.y_seq_desc(), + out_data, + out_grad_data, + rnn.x_seq_desc(), + in_grad_data, + rnn.init_h_desc(), + init_h_data, + last_h_grad_data, + init_h_grad_data, + rnn.init_c_desc(), + init_c_data, + last_c_grad_data, + init_c_grad_data, + rnn.weights_size(), + weight_data, + workspace_size, + workspace_data_.data(), + reserve_size, + const_cast(reserve_data))); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights_v8( + handle, + rnn.rnn_desc(), + CUDNN_WGRAD_MODE_ADD, + nullptr, + rnn.x_seq_desc(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_seq_desc(), + out.data(), + rnn.weights_size(), + weight_grad_data, + workspace_size, + workspace_data_.data(), + reserve_size, + const_cast(reserve_data))); +#else + + if (!has_seq_length) { +// This interface is used when the input/output is unpadded. 
+#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenRNNBackwardData(handle, + rnn.rnn_desc(), + seq_length, + rnn.y_descs(), + out_data, + rnn.y_descs(), + out_grad_data, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_descs(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNBackwardWeights( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_descs(), + out.data(), + rnn.weight_desc(), + weight_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNBackwardData(handle, + rnn.rnn_desc(), + seq_length, + rnn.y_descs(), + out_data, + rnn.y_descs(), + out_grad_data, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_descs(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeights( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_descs(), + out.data(), + workspace_data_.data(), + workspace_size, + rnn.weight_desc(), + weight_grad_data, + const_cast(reserve_data), + reserve_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
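+    // The *Ex variants take the padded sequence descriptors
+    // (x_seq_desc/y_seq_desc) built from SequenceLength in rnn.Create().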
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardDataEx( + handle, + rnn.rnn_desc(), + rnn.y_seq_desc(), + out_data, + rnn.y_seq_desc(), + out_grad_data, + nullptr, + nullptr, + rnn.last_h_desc(), + last_h_grad_data, + rnn.last_c_desc(), + last_c_grad_data, + rnn.weight_desc(), + weight_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.x_seq_desc(), + in_grad_data, + rnn.init_h_desc(), + init_h_grad_data, + rnn.init_c_desc(), + init_c_grad_data, + nullptr, + nullptr, + workspace_data_.data(), + workspace_size, + const_cast(reserve_data), + reserve_size)); + + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNBackwardWeightsEx( + handle, + rnn.rnn_desc(), + rnn.x_seq_desc(), + x.data(), + rnn.init_h_desc(), + init_h.data(), + rnn.y_seq_desc(), + out.data(), + workspace_data_.data(), + workspace_size, + rnn.weight_desc(), + weight_grad_data, + const_cast(reserve_data), + reserve_size)); +#else + PADDLE_THROW(common::errors::Unavailable( + "The padded input of rnn is supported by cudnnRNNBackwardDataEx, " + "cudnnRNNBackwardWeightsEx, but it only works when the version " + "of cudnn is larger than 7.2.1")); +#endif + } + +#endif // end CUDNN_VERSION >= 90000 +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL( + cudnn_lstm_grad, GPU, ALL_LAYOUT, phi::CudnnLSTMGradKernel, float) {} +#else +PD_REGISTER_PLUGIN_KERNEL(cudnn_lstm_grad, + metax_gpu, + ALL_LAYOUT, + phi::CudnnLSTMGradKernel, + float, + double) {} +#endif diff --git a/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu new file mode 100644 index 00000000000..6bb94c9281a --- /dev/null +++ b/backends/metax_gpu/kernels/metax_kernel/cudnn_lstm_kernel.cu @@ -0,0 +1,428 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "glog/logging.h" +#include "kernels/metax_kernel/metax_context.h" //NOLINT +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cudnn_lstm_kernel.h" +#include "paddle/phi/kernels/gpu/cudnn_lstm_utils.h" + +namespace phi { + +template +#ifdef PADDLE_WITH_HIP +void LSTMInference(const bool &has_seq_length, + const miopenHandle_t &handle, +#else +void LSTMInference(const bool &has_seq_length, + const cudnnHandle_t &handle, +#endif + const int &seq_length, + ScopedRNNBase *rnn, + const T *x_data, + const T *init_h_data, + const T *init_c_data, + const T *w_data, + T *out_data, + T *last_h_data, + T *last_c_data, + phi::DenseTensor *workspace_data, + const size_t &workspace_size) { +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForward(handle, + rnn->rnn_desc(), + CUDNN_FWD_MODE_INFERENCE, + nullptr, + rnn->x_seq_desc(), + x_data, + rnn->y_seq_desc(), + out_data, + rnn->init_h_desc(), + init_h_data, + last_h_data, + rnn->init_c_desc(), + init_c_data, + last_c_data, + rnn->weights_size(), + w_data, + workspace_size, + workspace_data->data(), + 0, + nullptr)); + +#else + + if (!has_seq_length) { +// for inference +// This interface is used when the input/output is unpadded. +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::miopenRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForwardInference(handle, + rnn->rnn_desc(), + seq_length, + rnn->x_descs(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_descs(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + workspace_data->data(), + workspace_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for inference + // This interface is used when the input/output is padded. 
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardInferenceEx( + handle, + rnn->rnn_desc(), + rnn->x_seq_desc(), + x_data, + rnn->init_h_desc(), + init_h_data, + rnn->init_c_desc(), + init_c_data, + rnn->weight_desc(), + w_data, + rnn->y_seq_desc(), + out_data, + rnn->last_h_desc(), + last_h_data, + rnn->last_c_desc(), + last_c_data, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + workspace_data->data(), + workspace_size)); +#else + // CUDNN VERSION has to >=7.2.1 + PADDLE_THROW(common::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardInferenceEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } + +#endif // end CUDNN_VERSION >= 90000 +} + +template +void CudnnLSTMKernel( + const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &init_h, + const DenseTensor &init_c, + const paddle::optional &w, + const paddle::optional> &weight_list, + const paddle::optional &sequence_length, + float dropout_prob, + bool is_bidirec, + int hidden_size, + int num_layers, + bool is_test, + int seed, + DenseTensor *out, + DenseTensor *last_h, + DenseTensor *last_c, + DenseTensor *reserve, + DenseTensor *state_out) { + const T *x_data = x.data(); + const T *init_h_data = init_h.data(); + const T *init_c_data = init_c.data(); + + T *out_data = dev_ctx.template Alloc(out); + T *last_h_data = dev_ctx.template Alloc(last_h); + T *last_c_data = dev_ctx.template Alloc(last_c); + + if (!is_test) { + if (seed == 0) { + // If not specify seed, use global Generator to generate seed. + int device_id = dev_ctx.GetPlace().GetDeviceId(); + auto gen_cuda = phi::DefaultCUDAGenerator(device_id); + seed = static_cast(gen_cuda->Random64()); + } + } + + auto *running_sequence_length = sequence_length.get_ptr(); + bool has_seq_length = running_sequence_length != nullptr; + std::vector SequenceLength; + if (has_seq_length) { + SequenceLength = phi::GetVectorFromTensor(running_sequence_length); + } + + // auto handle = dev_ctx.cudnn_handle(); + auto handle = GetDnnHandle(dev_ctx.stream(), dev_ctx.GetPlace()); + + int seq_length = x.dims()[0]; + int batch_size = x.dims()[1]; + int input_size = x.dims()[2]; + bool state_initialized = state_out->initialized() ? true : false; + + size_t workspace_size; + size_t reserve_size; + phi::DenseTensor weight_whole; + T *w_data = nullptr; + int weight_numel; + bool w_initialized = false; + auto place = dev_ctx.GetPlace(); + auto stream = dev_ctx.stream(); + auto *running_w = w.get_ptr(); + if (is_test && running_w != nullptr) { + w_initialized = running_w->initialized() ? true : false; + weight_numel = running_w->numel(); + } + if (!w_initialized) { + auto running_weight_list = *weight_list.get_ptr(); + bool continuous = is_continuous>( + running_weight_list); + weight_numel = size_sum(running_weight_list); + + if (!continuous) { + LOG_FIRST_N(WARNING, 2) + << "If the memory space of the Input WeightList is not continuous, " + "less efficient calculation will be called. 
Please call " + "flatten_parameters() to make the input memory continuous."; + weight_whole.Resize({weight_numel}); + dev_ctx.template Alloc(&weight_whole); + weight_to_tensor(place, stream, running_weight_list, &weight_whole); + w_data = weight_whole.data(); + if (is_test) { // maybe also reset small weights' ptr for training + int offset = 0; + for (size_t i = 0; i < running_weight_list.size(); ++i) { + size_t len = running_weight_list[i]->numel(); + auto dim = running_weight_list[i]->dims(); + const_cast(running_weight_list[i]) + ->ShareDataWith( + weight_whole.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + } + } else { + w_data = const_cast(running_weight_list[0]->data()); + } + } else { + w_data = const_cast(running_w->data()); + } + + ScopedRNNBase rnn(seq_length, + batch_size, + input_size, + hidden_size, + num_layers, + dropout_prob, + seed, + weight_numel, + state_initialized, + is_bidirec); + rnn.Create(handle, + dev_ctx.GetPlace(), + SequenceLength, + &workspace_size, + &reserve_size, + state_out); + + phi::DenseTensor workspace_data_; + workspace_data_.Resize({static_cast(workspace_size)}); + dev_ctx.template Alloc(&workspace_data_); + + reserve->Resize({static_cast(reserve_size)}); + auto *reserve_data = dev_ctx.template Alloc(reserve); + + if (is_test) { + LSTMInference(has_seq_length, + handle, + seq_length, + &rnn, + x_data, + init_h_data, + init_c_data, + w_data, + out_data, + last_h_data, + last_c_data, + &workspace_data_, + workspace_size); + } else { +#if CUDNN_VERSION >= 90000 + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForward(handle, + rnn.rnn_desc(), + CUDNN_FWD_MODE_TRAINING, + nullptr, + rnn.x_seq_desc(), + x_data, + rnn.y_seq_desc(), + out_data, + rnn.init_h_desc(), + init_h_data, + last_h_data, + rnn.init_c_desc(), + init_c_data, + last_c_data, + rnn.weights_size(), + w_data, + workspace_size, + workspace_data_.data(), + reserve_size, + reserve_data)); +#else + + if (!has_seq_length) { +// for train +// This interface is used when the input/output is unpadded. +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenRNNForwardTraining( + handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cudnnRNNForwardTraining(handle, + rnn.rnn_desc(), + seq_length, + rnn.x_descs(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_descs(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#endif + } else { +#if !defined(PADDLE_WITH_HIP) && CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
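+    // Padded-path training: per-sample lengths come from SequenceLength via
+    // x_seq_desc/y_seq_desc; activations are kept in reserve for backward.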
+ PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnRNNForwardTrainingEx( + handle, + rnn.rnn_desc(), + rnn.x_seq_desc(), + x_data, + rnn.init_h_desc(), + init_h_data, + rnn.init_c_desc(), + init_c_data, + rnn.weight_desc(), + w_data, + rnn.y_seq_desc(), + out_data, + rnn.last_h_desc(), + last_h_data, + rnn.last_c_desc(), + last_c_data, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + workspace_data_.data(), + workspace_size, + reserve_data, + reserve_size)); +#else + PADDLE_THROW(common::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardTrainingEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } +#endif // end CUDNN_VERSION >= 90000 + } +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(cudnn_lstm, GPU, ALL_LAYOUT, phi::CudnnLSTMKernel, float) { + kernel->InputAt(5).SetDataType(phi::DataType::INT32); + kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); +} +#else +PD_REGISTER_PLUGIN_KERNEL( + cudnn_lstm, metax_gpu, ALL_LAYOUT, phi::CudnnLSTMKernel, float, double) { + kernel->InputAt(5).SetDataType(phi::DataType::INT32); + kernel->OutputAt(3).SetDataType(phi::DataType::UINT8); + kernel->OutputAt(4).SetDataType(phi::DataType::UINT8); +} +#endif diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index b4f1afbe5b0..4e54e17b3ef 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -19,3 +19,7 @@ test_uniform_random_op test_c_embedding_op test_slice_op test_compare_op +test_conv3d_transpose_op +test_conv3d_layer +test_conv3d_transpose_part2_op +test_fused_conv2d_add_act_op From cccf6b7e68cbaedd28c666773020d094556ab251 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:12:32 +0800 Subject: [PATCH 61/95] [metax] rename yaml file (#77) * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file --------- --- .github/workflows/metax_work.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index aff530d475c..f14023848c6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -1,4 +1,4 @@ -name: padlle metax gpu test +name: paddle metax gpu test on: workflow_dispatch: From 7a7a7a0590eb0b61be1bd7a911f37dfd521cc2ec Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 14:30:16 +0800 Subject: [PATCH 62/95] [metax] rm file (#78) * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file --------- --- .../cuda_kernels/gammaln_grad_kernel.cu | 28 ----- .../kernels/impl/gammaln_grad_kernel_impl.h | 112 ------------------ .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 + 3 files changed, 2 insertions(+), 140 deletions(-) delete mode 100644 backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu delete mode 100644 backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu deleted file mode 100644 index c6bd53f007f..00000000000 --- a/backends/metax_gpu/kernels/cuda_kernels/gammaln_grad_kernel.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 
2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "kernels/impl/gammaln_grad_kernel_impl.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/gammaln_grad_kernel.h" - -PD_REGISTER_PLUGIN_KERNEL(gammaln_grad, - metax_gpu, - ALL_LAYOUT, - phi::GammalnGradKernel, - float, - double, - phi::float16, - phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h deleted file mode 100644 index 2b222ba3b2c..00000000000 --- a/backends/metax_gpu/kernels/impl/gammaln_grad_kernel_impl.h +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/kernels/funcs/for_range.h" - -namespace phi { -template -HOSTDEVICE T digamma_positive_domain(T x) { - constexpr T c = T{8.5}; - constexpr T euler_mascheroni = T{0.57721566490153286060}; - T r; - T value; - T x2; - - if (x <= T{0.000001}) { - value = -euler_mascheroni - T{1.0} / x + T{1.6449340668482264365} * x; - return value; - } - - value = T{0.0}; - x2 = x; - while (x2 < c) { - value = value - T{1.0} / x2; // NOLINT - x2 = x2 + T{1.0}; - } - - r = T{1.0} / x2; - value = value + std::log(x2) - T{0.5} * r; - - r = r * r; - - value = value - - r * (T{1.0} / T{12.0} - - r * (T{1.0} / T{120.0} - - r * (T{1.0} / T{252.0} - - r * (T{1.0} / T{240.0} - r * (T{1.0} / T{132.0}))))); - - return value; -} - -template -HOSTDEVICE T digamma(T x) { - const static T pi = T{3.14159265358979323846}; // NOLINT - - if (x == T{0.0}) { - T inf = std::numeric_limits::infinity(); - return std::signbit(x) ? 
inf : -inf; - } else if (x < T{0.0}) { - if (x == std::trunc(x)) { - return std::numeric_limits::quiet_NaN(); - } else { - T iptr; - T frac_part = std::modf(x, &iptr); - return digamma_positive_domain(T{1.0} - x) - - pi / std::tan(pi * frac_part); - } - } else { - return digamma_positive_domain(x); - } -} - -template -struct GammalnGradFunctor { - GammalnGradFunctor(const T* dout, const T* x, T* output, int64_t numel) - : dout_(dout), x_(x), output_(output), numel_(numel) {} - - HOSTDEVICE void operator()(int64_t idx) const { - using MT = typename phi::dtype::MPTypeTrait::Type; - const MT mp_dout = static_cast(dout_[idx]); - const MT mp_x = static_cast(x_[idx]); - output_[idx] = static_cast(mp_dout * digamma(mp_x)); - } - - private: - const T* dout_; - const T* x_; - T* output_; - int64_t numel_; -}; -template -void GammalnGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& d_out, - DenseTensor* d_x) { - auto numel = d_out.numel(); - if (d_x && d_x->numel() == 0) { - dev_ctx.template Alloc(d_x); - return; - } - auto* dout_data = d_out.data(); - auto* x_data = x.data(); - auto* dx_data = - dev_ctx.template Alloc(d_x, static_cast(numel * sizeof(T))); - phi::funcs::ForRange for_range(dev_ctx, numel); - GammalnGradFunctor functor(dout_data, x_data, dx_data, numel); - for_range(functor); -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index 2598ce093e6..fa2c9e6e8b7 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -181,6 +181,7 @@ void RnnKernel(const Context &dev_ctx, else if (mode == "RNN_TANH") rnn_mode = miopenRNNTANH; #else + VLOG(0) << "Leave lstmKernel.11"; gpuRNNMode_t rnn_mode = CUDNN_LSTM; if (mode == "LSTM") rnn_mode = CUDNN_LSTM; @@ -228,6 +229,7 @@ void RnnKernel(const Context &dev_ctx, common::errors::InvalidArgument( "ROCm do not support SequenceLength yet.")); #endif + VLOG(0) << "Leave lstmKernel.12"; std::vector SequenceLength; if (has_seq_length) { SequenceLength = phi::GetVectorFromTensor(sequence_length.get_ptr()); From 5a76d35b53e1f7d970d6b388969ba56ae955dc0d Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 30 Sep 2025 17:18:00 +0800 Subject: [PATCH 63/95] metax_fix_ci (#79) * [metax] add Rules --------- --- .github/workflows/metax_work.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index f14023848c6..f73442b6fd5 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -7,6 +7,7 @@ on: branches: [develop, release/**] paths: - "**" + - "Paddle/**" - "!backends/**" - "backends/metax_gpu/**" From ceb55ebf2a0a0398f9fa318b79ac1e41a079a759 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Sat, 11 Oct 2025 09:45:57 +0800 Subject: [PATCH 64/95] [metax] add print tensor (#91) * modify cmake for warpctc and warprnnt * modify conv for tf32 and fp32 * modify conv kernel * modify library to static library * modify kernel * modify fused_bias_dropout_residual_layer_norm * modify compile * modify blas * modify blas * modify blas * modify blas * modify context * modify kernels * modify kernels * modify kernels * add print tensor --- backends/metax_gpu/CMakeLists.txt | 2 + .../flags_declare.cc} | 11 + backends/metax_gpu/common/utils.cc | 297 ++++++++++++++++++ backends/metax_gpu/common/utils.h | 28 ++ 4 
files changed, 338 insertions(+) rename backends/metax_gpu/{kernels/metax_kernel/flags_declare.cu => common/flags_declare.cc} (89%) create mode 100644 backends/metax_gpu/common/utils.cc create mode 100644 backends/metax_gpu/common/utils.h diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 475074ced89..e357a5e5912 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -648,6 +648,7 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/lars_momentum_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/partial_sum_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/gpu/gpu_info.cc # ############################################################################ ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps @@ -687,6 +688,7 @@ file( RELATIVE ${CMAKE_SOURCE_DIR} runtime/runtime.cc passes/*.cc + common/*.cc kernels/*.cc kernels/*.cu kernels/fusion/*.cc diff --git a/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu b/backends/metax_gpu/common/flags_declare.cc similarity index 89% rename from backends/metax_gpu/kernels/metax_kernel/flags_declare.cu rename to backends/metax_gpu/common/flags_declare.cc index d7aefe54e9f..6b497cf9fdf 100644 --- a/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu +++ b/backends/metax_gpu/common/flags_declare.cc @@ -80,6 +80,17 @@ PHI_DEFINE_EXPORTED_bool( "faster but it may loss precision in most case. If true, the compute " "type will be set to fp16. Default is false."); +PHI_DEFINE_EXPORTED_string( + selected_gpus, + "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (GPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" + "share-memory only."); + PHI_DEFINE_EXPORTED_bool(use_fast_math, false, "Whether to use fast math GPU functions."); diff --git a/backends/metax_gpu/common/utils.cc b/backends/metax_gpu/common/utils.cc new file mode 100644 index 00000000000..58e835687d9 --- /dev/null +++ b/backends/metax_gpu/common/utils.cc @@ -0,0 +1,297 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "common/utils.h" + +#include "glog/logging.h" +#include "paddle/phi/backends/context_pool.h" +#include "paddle/phi/backends/custom/custom_context.h" + +namespace phi { +namespace { +C_Status AsyncMemCpyH2D(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size) { + if (size == 0) { + return C_SUCCESS; + } + + if (dst == NULL || src == NULL) { + return C_ERROR; + } + cudaError_t cudaErr = cudaSetDevice(device->id); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + return C_SUCCESS; +} + +C_Status AsyncMemCpyD2H(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size) { + if (size == 0) { + return C_SUCCESS; + } + + if (dst == NULL || src == NULL) { + return C_ERROR; + } + + cudaError_t cudaErr = cudaSetDevice(device->id); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + return C_SUCCESS; +} + +C_Status AsyncMemCpyD2D(const C_Device device, + C_Stream stream, + void* dst, + const void* src, + size_t size) { + if (size == 0) { + VLOG(2) << "cudamemcpy successful: " << dst << " " << src << " " + << size; // NOLINT + return C_SUCCESS; + } + + if (dst == NULL || src == NULL) { + return C_ERROR; + } + + cudaError_t cudaErr = cudaSetDevice(device->id); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + + cudaErr = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice); + if (cudaErr != cudaSuccess) { + return C_ERROR; + } + VLOG(2) << "cudamemcpy successful: " << dst << " " << src << " " + << size; // NOLINT + return C_SUCCESS; +} + +template +inline void TensorCopy(const Context& dev_ctx, + const phi::DenseTensor& src, + bool blocking, + phi::DenseTensor* dst, + const phi::Place& dst_place = phi::CustomPlace()) { + auto* src_ptr = src.data(); + const auto& src_place = src.place(); + if (src_ptr == nullptr) { + return; + } + auto dst_place_ = dst_place; + if (dst_place_.GetType() != phi::AllocationType::CPU) { + dst_place_ = dev_ctx.GetPlace(); + } + + if (&src == dst) { + if (src_place == dst_place_) { + VLOG(6) << "Skip copy the same data(" << src_ptr << ") from " << src_place + << " to " << dst_place_; + } else { + VLOG(6) << "Src and dst are the same Tensor, in-place copy data(" + << src_ptr << ") from " << src_place << " to " << dst_place_; + const phi::DenseTensor src_copy = src; + TensorCopy(dev_ctx, src_copy, blocking, dst, dst_place_); + } + return; + } + + auto dst_dims = dst->dims(); + dst->Resize(src.dims()); + void* dst_ptr = nullptr; + if (dst_place_.GetType() != phi::AllocationType::CPU) { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + } else { + dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } + + PADDLE_ENFORCE_EQ( + dst->place(), + dst_place_, + phi::errors::Unavailable( + "The Dst Tensor's place and dst_place do not match, Tensor's place " + "place is %s, dst_place is %s.", + dst->place(), + dst_place_)); + + if (src_ptr == dst_ptr && src_place == dst_place_) { + if ((dst_dims == src.dims()) || (src_place == phi::CPUPlace())) { + VLOG(3) << "Skip copy the same data async from " << src_ptr << " in " + << src_place << " to " << dst_ptr << " in " << dst_place_; + return; + } else { + // scatter memory + phi::DenseTensor tmp_dst; + tmp_dst.set_meta(dst->meta()); + tmp_dst.Resize(dst_dims); + dst_ptr = dev_ctx.Alloc(&tmp_dst, 
tmp_dst.dtype()); + *dst = tmp_dst; + } + } + VLOG(4) << "src:" << src_ptr << " place: " << src_place + << " type:" << static_cast(src_place.GetType()) + << ", dst:" << dst_ptr << " place: " << dst_place_ + << " type:" << static_cast(dst_place_.GetType()); + + C_Stream stream = reinterpret_cast(dev_ctx.stream()); + + auto size = + (src.dims().size() != 0 ? src.numel() : 1) * phi::SizeOf(src.dtype()); + if (UNLIKELY(size) == 0) { + return; + } + + if (src_place.GetType() == phi::AllocationType::CPU && + dst_place_.GetType() == phi::AllocationType::CUSTOM) { + VLOG(6) << "TensorCopy from cpu to cus"; + C_Device_st device; + device.id = dst_place_.GetDeviceId(); + AsyncMemCpyH2D(&device, stream, dst_ptr, src_ptr, size); + if (blocking) { + dev_ctx.Wait(); + } + } else if (src_place.GetType() == phi::AllocationType::CUSTOM && + dst_place_.GetType() == phi::AllocationType::CPU) { + VLOG(6) << "TensorCopy from cus to cpu"; + C_Device_st device; + device.id = src_place.GetDeviceId(); + AsyncMemCpyD2H(&device, stream, dst_ptr, src_ptr, size); + if (blocking) { + dev_ctx.Wait(); + } + } else if (src_place.GetType() == phi::AllocationType::CUSTOM && + dst_place_.GetType() == phi::AllocationType::CUSTOM) { + VLOG(6) << "TensorCopy from cus to cus"; + if (src_place.GetDeviceType() == dst_place_.GetDeviceType()) { + if (src_place.GetDeviceId() == dst_place_.GetDeviceId()) { + C_Device_st device; + device.id = src_place.GetDeviceId(); + AsyncMemCpyD2D(&device, stream, dst_ptr, src_ptr, size); + if (blocking) { + dev_ctx.Wait(); + } + } else { + PADDLE_THROW( + phi::errors::Unimplemented("TensorCopy is not supported.")); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented("TensorCopy is not supported.")); + } + } else if (src_place.GetType() == phi::AllocationType::CPU && + dst_place_.GetType() == phi::AllocationType::CPU) { + VLOG(6) << "TensorCopy from cpu to cpu"; + std::memcpy(dst_ptr, src_ptr, size); + } +} + +template +std::ostream& PrintTensor(std::ostream& os, const phi::DenseTensor& tensor) { + phi::DenseTensor cpu_tensor; + if (tensor.place().GetType() != phi::AllocationType::CPU) { + auto dev_ctx = static_cast( + phi::DeviceContextPool::Instance().Get(tensor.place())); + TensorCopy(*dev_ctx, tensor, true, &cpu_tensor, phi::CPUPlace()); + } else { + cpu_tensor = tensor; + } + os << "DenseTensor<"; + if (tensor.initialized()) { + os << phi::DataTypeToString(tensor.dtype()) << ", "; + os << tensor.place() << ", "; + os << "Shape(" << tensor.dims() << "), "; + os << "Strides(" << tensor.strides() << "), "; + os << "layout:" << tensor.layout() << ", "; + os << "data: ["; + + auto ptr = cpu_tensor.data(); + auto element_num = cpu_tensor.numel(); + // Note: int8_t && uint8_t is typedef of char, ostream unable to print + // properly + if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) { + if (element_num > 0) { + os << signed(ptr[0]); + for (int j = 1; j < element_num; ++j) { + os << " " << signed(ptr[j]); + } + } + } else { + if (element_num > 0) { + os << ptr[0]; + for (int j = 1; j < element_num; ++j) { + os << " " << ptr[j]; + } + } + } + os << "]"; + } else { + os << "NOT_INITED"; + } + os << ">"; + return os; +} +} // namespace + +#define FOR_EACH_DATA_TYPE_TO_PRINT(_) \ + _(bool, phi::DataType::BOOL) \ + _(int8_t, phi::DataType::INT8) \ + _(uint8_t, phi::DataType::UINT8) \ + _(int16_t, phi::DataType::INT16) \ + _(uint16_t, phi::DataType::UINT16) \ + _(int32_t, phi::DataType::INT32) \ + _(uint32_t, phi::DataType::UINT32) \ + _(int64_t, phi::DataType::INT64) \ + _(uint64_t, 
phi::DataType::UINT64) \ + _(phi::bfloat16, phi::DataType::BFLOAT16) \ + _(phi::float16, phi::DataType::FLOAT16) \ + _(float, phi::DataType::FLOAT32) \ + _(double, phi::DataType::FLOAT64) + +#define CALL_PRINT_TENSOR(cpp_type, data_type) \ + case data_type: \ + PrintTensor(os, t); \ + break; + +std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t) { + switch (t.dtype()) { + FOR_EACH_DATA_TYPE_TO_PRINT(CALL_PRINT_TENSOR) + default: + VLOG(1) << "PrintTensor unrecognized data type:" << t.dtype(); + } + return os; +} +#undef FOR_EACH_DATA_TYPE_TO_PRINT +#undef CALL_PRINT_TENSOR +} // namespace phi diff --git a/backends/metax_gpu/common/utils.h b/backends/metax_gpu/common/utils.h new file mode 100644 index 00000000000..74e8aa9d788 --- /dev/null +++ b/backends/metax_gpu/common/utils.h @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { +std::ostream& operator<<(std::ostream& os, const phi::DenseTensor& t); +} From e533cc49db93959a0e5cabd00e3de8a71156b4b7 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 13 Oct 2025 10:05:21 +0800 Subject: [PATCH 65/95] [Metax] change_patch (#94) * [metax] change_patch --------- --- backends/metax_gpu/patch/paddle.patch | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 69d714ef6e0..f2e4f067bb2 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -902,11 +902,11 @@ index 9d4bb18d55..ea42cc10a9 100644 } } diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -index b8cfdbf3ce..fa14b94a77 100644 +index acb3b83bc9..264d2a2b3e 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu -@@ -14,7 +14,7 @@ - +@@ -15,7 +15,7 @@ + #include "paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" @@ -915,11 +915,11 @@ index b8cfdbf3ce..fa14b94a77 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -index e838778952..83e805e75a 100644 +index b2d15a59f8..f64582e85a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu -@@ -14,7 +14,7 @@ - +@@ -15,7 +15,7 @@ + #include "paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.h" #include 
"paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" From d398e1a8627fc862d61ead0aa17f0f8a39715b97 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 13 Oct 2025 17:02:47 +0800 Subject: [PATCH 66/95] update paddle (#95) * update paddle --------- --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 2588f489910..cc367e8767d 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 2588f4899106cd27bdfcc84ba4c2f5f7aac570ab +Subproject commit cc367e8767d49819b5100f22e279cd62a1587670 From 813b9230bc7dc67adbface58967e32faf0119ce8 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Mon, 13 Oct 2025 18:33:50 +0800 Subject: [PATCH 67/95] [metax] fix dot error (#96) * [metax] fix dot error --------- --- backends/metax_gpu/kernels/funcs/blas/blas.h | 8 +++++++- backends/metax_gpu/patch/paddle.patch | 13 +++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/funcs/blas/blas.h b/backends/metax_gpu/kernels/funcs/blas/blas.h index fa4b4643f89..75ea8c921e2 100644 --- a/backends/metax_gpu/kernels/funcs/blas/blas.h +++ b/backends/metax_gpu/kernels/funcs/blas/blas.h @@ -282,6 +282,9 @@ class Blas { template T DOT(int n, const T* x, const T* y) const; + template + void CUDOT( + int n, const T* x, int incx, const T* y, int incy, T* result) const; template void SCAL(int n, const T a, T* x) const; @@ -541,7 +544,10 @@ class BlasT : private Blas { T DOT(ARGS... args) const { return Base()->template DOT(args...); } - + template + void CUDOT(ARGS... args) const { + Base()->template CUDOT(args...); + } template void SCAL(ARGS... args) const { Base()->template SCAL(args...); diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index f2e4f067bb2..7ba32b5b399 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -942,6 +942,19 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. 
+diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu +index af27ac89ab..ee0edc6b8e 100644 +--- a/paddle/phi/kernels/gpu/dot_kernel.cu ++++ b/paddle/phi/kernels/gpu/dot_kernel.cu +@@ -15,7 +15,7 @@ + #include "paddle/phi/kernels/dot_kernel.h" + #include "paddle/phi/backends/gpu/gpu_context.h" + #include "paddle/phi/core/kernel_registry.h" +-#include "paddle/phi/kernels/funcs/blas/blas.h" ++#include "kernels/funcs/blas/blas.h" + #include "paddle/phi/kernels/funcs/eigen/common.h" + + #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h From 6abf13c002bff418b261e20309f71fdd819c28eb Mon Sep 17 00:00:00 2001 From: metax666 Date: Tue, 14 Oct 2025 10:41:54 +0800 Subject: [PATCH 68/95] Update metax_work.yaml --- .github/workflows/metax_work.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index f73442b6fd5..fd7d04c0843 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -40,7 +40,7 @@ jobs: git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - # git submodule update --init --recursive + git submodule update --init --recursive fi From 16d655b6ad22abe84e484a7bfe0a8c6c52d505a7 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 14 Oct 2025 15:22:59 +0800 Subject: [PATCH 69/95] [metax]rm opt path and fix activation_kernel bug (#98) * [metax]rm opt path and fix activation_kernel bug --------- --- backends/metax_gpu/CMakeLists.txt | 10 ++++---- backends/metax_gpu/cmake/dgc.cmake | 4 +-- .../activation_grad_kernel_register.cu | 25 +++++++++++++++---- .../activation_kernel_register.cu | 24 ++++++++++++++---- 4 files changed, 46 insertions(+), 17 deletions(-) diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index e357a5e5912..3e92996f9a2 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -703,9 +703,9 @@ file( set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA) - +set(MACA_PATH $ENV{MACA_PATH}) set(CMAKE_CUCC_COMPILER "cucc") -set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/") +set(CMAKE_CUCC_FLAGS "-I ${MACA_PATH}/tools/cu-bridge/include/") add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS}) @@ -734,9 +734,9 @@ target_link_libraries( ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmccl.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so) +target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) diff --git a/backends/metax_gpu/cmake/dgc.cmake b/backends/metax_gpu/cmake/dgc.cmake index 4c54e636d5e..4c61f2e6bcb 100644 --- a/backends/metax_gpu/cmake/dgc.cmake +++ b/backends/metax_gpu/cmake/dgc.cmake @@ -62,8 +62,8 @@ if(EXISTS ${DGC_DOWNLOAD_DIR}/${DGC_CACHE_FILENAME}) else() download_dgc() endif() - 
-set(CU_BRIDGE_PATH "/opt/maca/tools/cu-bridge") +set(MACA_PATH $ENV{MACA_PATH}) +set(CU_BRIDGE_PATH "${MACA_PATH}/tools/cu-bridge") add_custom_command( OUTPUT "${CU_BRIDGE_PATH}/bin/nvcc" diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu index 6cdfb2f5242..6c46ef10c0f 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu @@ -119,7 +119,22 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } - +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + double attr1, \ + double attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -239,10 +254,10 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, - CudaSoftplusGradFunctor, - beta, - threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX(Softplus, + CudaSoftplusGradFunctor, + beta, + threshold); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, CudaHardSigmoidGradFunctor, slope, diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu index f24f3e8abbc..363932cfc28 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu @@ -90,7 +90,21 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl>( \ dev_ctx, x, out, functor); \ } - +#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + double attr1, \ + double attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) @@ -139,10 +153,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardTanh, t_min, t_max) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b) -DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, - CudaSoftplusFunctor, - beta, - threshold) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS(Softplus, + CudaSoftplusFunctor, + beta, + threshold) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, CudaHardSigmoidFunctor, slope, From 4b596b94e638e29c7b520f96524eb9bbf0acce4e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 14 Oct 2025 17:17:54 +0800 Subject: [PATCH 70/95] updata_paddle (#99) * updata paddle --------- --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle 
b/Paddle index cc367e8767d..89f4bd92f49 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit cc367e8767d49819b5100f22e279cd62a1587670 +Subproject commit 89f4bd92f49e15a9e1803a9e582526b2b8e4557d From 94623f4d0492d688e8753655dc6229e7cecc0fa9 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Thu, 16 Oct 2025 10:34:54 +0800 Subject: [PATCH 71/95] [Metax] Fix some tests (#102) * fix some tests --- backends/metax_gpu/tests/CMakeLists.txt | 8 +- .../unit_test/test_conv3d_layer_metax.py | 381 ++++++ .../test_conv3d_transpose_op_metax.py | 764 ++++++++++++ .../test_conv3d_transpose_part2_op_metax.py | 108 ++ .../unit_test/test_deform_conv2d_metax.py | 323 +++++ .../test_deformable_conv_op_metax.py | 504 ++++++++ .../test_deformable_conv_v1_op_metax.py | 319 +++++ .../unit_test/test_einsum_0d_tensor_metax.py | 201 +++ .../tests/unit_test/test_fc_op_metax.py | 138 ++ .../test_imperative_double_grad_metax.py | 1106 +++++++++++++++++ .../unit_test/test_linalg_matrix_exp_metax.py | 268 ++++ 11 files changed, 4119 insertions(+), 1 deletion(-) create mode 100644 backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_fc_op_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py create mode 100644 backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 0c84ada4b65..084b5b8c601 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -49,7 +49,13 @@ foreach(test_name ${TEST_PROGRAMS}) continue() endif() - set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + string(FIND "${test_name}" "metax" METAX_SUFFIX_POS) + if(NOT METAX_SUFFIX_POS EQUAL -1) + set(CURRENT_TEST_PROGRAM ${METAX_UNIT_TEST_PATH}/${test_name}.py) + else() + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + endif() + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") else() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py new file mode 100644 index 00000000000..cd4cd290065 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py @@ -0,0 +1,381 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import numpy as np +from op_test import get_device_place, is_custom_device +from test_conv3d_op import conv3d_forward_naive + +import paddle +import paddle.base.dygraph as dg +import paddle.nn.functional as F +from paddle import base, nn +from paddle.base import core + +core.set_cudnn_switch(False) + + +class Conv3DTestCase(unittest.TestCase): + def __init__( + self, + methodName="runTest", + batch_size=4, + spatial_shape=(8, 8, 8), + num_channels=6, + num_filters=8, + filter_size=3, + padding=0, + stride=1, + dilation=1, + groups=1, + no_bias=False, + data_format="NCDHW", + dtype="float32", + ): + super().__init__(methodName) + self.batch_size = batch_size + self.num_channels = num_channels + self.num_filters = num_filters + self.spatial_shape = spatial_shape + self.filter_size = filter_size + + self.padding = padding + self.stride = stride + self.dilation = dilation + self.groups = groups + self.no_bias = no_bias + self.data_format = data_format + self.dtype = dtype + + def setUp(self): + self.channel_last = self.data_format == "NDHWC" + if self.channel_last: + input_shape = ( + self.batch_size, + *self.spatial_shape, + self.num_channels, + ) + else: + input_shape = ( + self.batch_size, + self.num_channels, + *self.spatial_shape, + ) + self.input = np.random.randn(*input_shape).astype(self.dtype) + + if isinstance(self.filter_size, int): + filter_size = [self.filter_size] * 3 + else: + filter_size = self.filter_size + self.weight_shape = weight_shape = ( + self.num_filters, + self.num_channels // self.groups, + *filter_size, + ) + self.weight = np.random.uniform(-1, 1, size=weight_shape).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform(-1, 1, size=(self.num_filters,)).astype( + self.dtype + ) + else: + self.bias = None + + def base_layer(self, place): + main = base.Program() + start = base.Program() + with ( + base.unique_name.guard(), + base.program_guard(main, start), + ): + input_shape = ( + (-1, -1, -1, -1, self.num_channels) + if self.channel_last + else (-1, self.num_channels, -1, -1, -1) + ) + x_var = paddle.static.data("input", input_shape, dtype=self.dtype) + weight_attr = paddle.nn.initializer.Assign(self.weight) + if self.bias is None: + bias_attr = False + else: + bias_attr = paddle.nn.initializer.Assign(self.bias) + y_var = paddle.nn.Conv3D( + in_channels=self.num_channels, + out_channels=self.num_filters, + kernel_size=self.filter_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + padding_mode="zeros", + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=self.data_format, + )(x_var) + feed_dict = {"input": self.input} + exe = base.Executor(place) + exe.run(start) + (y_np,) = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def functional(self, place): + main = base.Program() + start = base.Program() + with ( + base.unique_name.guard(), + base.program_guard(main, start), + ): + input_shape = ( + (-1, -1, -1, -1, self.num_channels) + if self.channel_last + else (-1, self.num_channels, -1, -1, -1) + ) + x_var = paddle.static.data("input", input_shape, dtype=self.dtype) + w_var = paddle.static.data("weight", self.weight_shape, dtype=self.dtype) + if not self.no_bias: + b_var = paddle.static.data( + "bias", (self.num_filters,), dtype=self.dtype + ) + else: + b_var = None + y_var = F.conv3d( + x_var, + w_var, + b_var, + padding=self.padding, + 
stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format, + ) + feed_dict = {"input": self.input, "weight": self.weight} + if self.bias is not None: + feed_dict["bias"] = self.bias + exe = base.Executor(place) + exe.run(start) + (y_np,) = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def paddle_nn_layer(self): + x_var = paddle.to_tensor(self.input) + x_var.stop_gradient = False + conv = nn.Conv3D( + self.num_channels, + self.num_filters, + self.filter_size, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format, + ) + conv.weight.set_value(self.weight) + if not self.no_bias: + conv.bias.set_value(self.bias) + y_var = conv(x_var) + y_var.backward() + y_np = y_var.numpy() + t1 = x_var.gradient() + return y_np, t1 + + def _test_pir_equivalence(self, place): + with paddle.pir_utils.IrGuard(): + result1 = self.base_layer(place) + result2 = self.functional(place) + with dg.guard(place): + result3, g1 = self.paddle_nn_layer() + np.testing.assert_array_almost_equal(result1, result2) + np.testing.assert_array_almost_equal(result2, result3) + + def runTest(self): + place = base.CPUPlace() + self._test_pir_equivalence(place) + + if base.core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + self._test_pir_equivalence(place) + + +class Conv3DErrorTestCase(Conv3DTestCase): + def runTest(self): + place = base.CPUPlace() + with ( + dg.guard(place), + self.assertRaises(ValueError), + ): + self.paddle_nn_layer() + + +def add_cases(suite): + suite.addTest(Conv3DTestCase(methodName="runTest")) + suite.addTest(Conv3DTestCase(methodName="runTest", stride=[1, 2, 1], dilation=2)) + suite.addTest(Conv3DTestCase(methodName="runTest", stride=2, dilation=(2, 1, 2))) + suite.addTest(Conv3DTestCase(methodName="runTest", padding="same", no_bias=True)) + suite.addTest( + Conv3DTestCase(methodName="runTest", filter_size=(3, 2, 3), padding="valid") + ) + suite.addTest(Conv3DTestCase(methodName="runTest", padding=(2, 3, 1))) + suite.addTest(Conv3DTestCase(methodName="runTest", padding=[1, 2, 2, 1, 2, 3])) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + padding=[[0, 0], [0, 0], [1, 2], [2, 1], [2, 2]], + ) + ) + suite.addTest(Conv3DTestCase(methodName="runTest", data_format="NDHWC")) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + data_format="NDHWC", + padding=[[0, 0], [1, 1], [3, 3], [2, 2], [0, 0]], + ) + ) + suite.addTest(Conv3DTestCase(methodName="runTest", groups=2, padding="valid")) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + num_filters=6, + num_channels=3, + groups=3, + padding="valid", + ) + ) + + +def add_error_cases(suite): + suite.addTest(Conv3DErrorTestCase(methodName="runTest", num_channels=5, groups=2)) + suite.addTest( + Conv3DErrorTestCase( + methodName="runTest", num_channels=5, groups=2, padding=[-1, 1, 3] + ) + ) + + +def load_tests(loader, standard_tests, pattern): + suite = unittest.TestSuite() + add_cases(suite) + add_error_cases(suite) + return suite + + +def get_places(): + places = [] + if core.is_compiled_with_xpu(): + places.append(paddle.device.XPUPlace(0)) + elif core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + places.append(paddle.CPUPlace()) + return places + + +class TestConv3dAPI_Compatibility(unittest.TestCase): + def setUp(self): + np.random.seed(2025) + self.places = get_places() + self.shape_x = [2, 3, 8, 8, 8] # NCDHW + self.shape_w = [6, 3, 3, 3, 3] # Co, Cin, 
kD, kH, kW + self.dtype = "float32" + self.init_data() + + def init_data(self): + self.np_x = np.random.rand(*self.shape_x).astype(self.dtype) + self.np_w = np.random.rand(*self.shape_w).astype(self.dtype) + conv_param = { + "stride": [1, 1, 1], + "pad": [0, 0, 0], + "dilation": [1, 1, 1], + } + self.np_ref_out = conv3d_forward_naive(self.np_x, self.np_w, 1, conv_param) + + def test_dygraph_Compatibility(self): + for place in self.places: + paddle.device.set_device(place) + paddle.disable_static() + x = paddle.to_tensor(self.np_x) + w = paddle.to_tensor(self.np_w) + + paddle_dygraph_out = [] + # Position args (args) + out1 = paddle.nn.functional.conv3d(x, w) + paddle_dygraph_out.append(out1) + # Key words args (kwargs) for paddle + out2 = paddle.nn.functional.conv3d(x=x, weight=w) + paddle_dygraph_out.append(out2) + # Key words args for alias compatibility + out3 = paddle.nn.functional.conv3d(input=x, weight=w) + paddle_dygraph_out.append(out3) + # Combined args and kwargs + out4 = paddle.nn.functional.conv3d(x, weight=w) + paddle_dygraph_out.append(out4) + + # refer to test/xpu/test_conv3d_op_xpu.py + if isinstance(place, core.XPUPlace): + rtol = 5e-3 + atol = 5e-3 + else: + rtol = 1e-5 + atol = 0 + + # Check all dygraph results against reference + for out in paddle_dygraph_out: + np.testing.assert_allclose( + self.np_ref_out, out.numpy(), rtol=rtol, atol=atol + ) + paddle.enable_static() + + def test_static_Compatibility(self): + paddle.enable_static() + + fetch_list = [] + main = paddle.static.Program() + startup = paddle.static.Program() + with base.program_guard(main, startup): + x = paddle.static.data(name="x", shape=self.shape_x, dtype=self.dtype) + w = paddle.static.data(name="w", shape=self.shape_w, dtype=self.dtype) + + # Position args (args) + out1 = paddle.nn.functional.conv3d(x, w) + fetch_list.append(out1) + # Key words args (kwargs) for paddle + out2 = paddle.nn.functional.conv3d(x=x, weight=w) + fetch_list.append(out2) + # Key words args for alias compatibility + out3 = paddle.nn.functional.conv3d(input=x, weight=w) + fetch_list.append(out3) + # Combined args and kwargs + out4 = paddle.nn.functional.conv3d(x, weight=w) + fetch_list.append(out4) + + for place in self.places: + # refer to test/xpu/test_conv2d_op_xpu.py + if isinstance(place, core.XPUPlace): + rtol = 5e-3 + atol = 5e-3 + else: + rtol = 1e-5 + atol = 0 + + exe = base.Executor(place) + fetches = exe.run( + main, + feed={"x": self.np_x, "w": self.np_w}, + fetch_list=fetch_list, + ) + for out in fetches: + np.testing.assert_allclose( + out, self.np_ref_out, rtol=rtol, atol=atol + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py new file mode 100644 index 00000000000..6f55aac3361 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py @@ -0,0 +1,764 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle + +paddle.enable_static() +from op_test import ( + OpTest, + copy_bits_from_float_to_uint16, + get_device_place, + is_custom_device, +) + +from paddle.base import core + +core.set_cudnn_switch(False) + + +def convert_float_to_uint16(float_list, data_format="NCHW"): + if data_format == "NHWC": + float_list = np.transpose(float_list, [0, 4, 1, 2, 3]) + + new_output = [] + for x in np.nditer(float_list): + new_output.append(np.uint16(copy_bits_from_float_to_uint16(x))) + new_output = np.reshape(new_output, float_list.shape).view(np.uint16) + + if data_format == "NHWC": + new_output = np.transpose(new_output, [0, 2, 3, 4, 1]) + return new_output + + +def conv3dtranspose_forward_naive(input_, filter_, attrs): + padding_algorithm = attrs["padding_algorithm"] + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError( + f"Unknown Attr(padding_algorithm): '{padding_algorithm}'. " + "It can only be 'SAME' or 'VALID'." + ) + + if attrs["data_format"] == "NHWC": + input_ = np.transpose(input_, [0, 4, 1, 2, 3]) + in_n, in_c, in_d, in_h, in_w = input_.shape + f_c, f_out_c, f_d, f_h, f_w = filter_.shape + groups = attrs["groups"] + assert in_c == f_c + out_c = f_out_c * groups + sub_in_c = in_c // groups + + stride, pad, dilations = ( + attrs["strides"], + attrs["paddings"], + attrs["dilations"], + ) + + def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): + padding = [] + for input_size, filter_size, stride_size in zip( + input_shape, kernel_size, kernel_stride + ): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max( + ((out_size - 1) * stride_size + filter_size - input_size, 0) + ) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + ksize = filter_.shape[2:5] + if padding_algorithm == "VALID": + pad = [0, 0, 0, 0, 0, 0] + elif padding_algorithm == "SAME": + dilations = [1, 1, 1] + input_data_shape = input_.shape[2:5] + pad = _get_padding_with_SAME(input_data_shape, ksize, stride) + + pad_d_0, pad_d_1 = pad[0], pad[0] + pad_h_0, pad_h_1 = pad[1], pad[1] + pad_w_0, pad_w_1 = pad[2], pad[2] + if len(pad) == 6: + pad_d_0, pad_d_1 = pad[0], pad[1] + pad_h_0, pad_h_1 = pad[2], pad[3] + pad_w_0, pad_w_1 = pad[4], pad[5] + + d_block_d = dilations[0] * (f_d - 1) + 1 + d_block_h = dilations[1] * (f_h - 1) + 1 + d_block_w = dilations[2] * (f_w - 1) + 1 + out_d = (in_d - 1) * stride[0] + d_block_d + out_h = (in_h - 1) * stride[1] + d_block_h + out_w = (in_w - 1) * stride[2] + d_block_w + out = np.zeros((in_n, out_c, out_d, out_h, out_w)) + + for n in range(in_n): + for d in range(in_d): + for i in range(in_h): + for j in range(in_w): + for g in range(groups): + input_masked = input_[ + n, g * sub_in_c : (g + 1) * sub_in_c, d, i, j + ] # (c) + input_masked = np.reshape(input_masked, (sub_in_c, 1, 1, 1)) + input_masked = np.tile(input_masked, (1, f_d, f_h, f_w)) + + for k in range(f_out_c): + tmp_out = np.sum( + input_masked + * filter_[ + g * sub_in_c : (g + 1) * sub_in_c, + k, + :, + :, + :, + ], + axis=0, + ) + d1, d2 = d * stride[0], d * stride[0] + d_block_d + i1, i2 = i * stride[1], i * stride[1] + d_block_h + j1, j2 = j * stride[2], j * stride[2] + d_block_w + out[ + n, + g * f_out_c + k, + d1 : d2 : dilations[0], + i1 : i2 : dilations[1], + j1 : j2 : dilations[2], + ] += tmp_out + + out = out[ + :, + :, + 
pad_d_0 : out_d - pad_d_1, + pad_h_0 : out_h - pad_h_1, + pad_w_0 : out_w - pad_w_1, + ] + if attrs["data_format"] == "NHWC": + out = np.transpose(out, [0, 2, 3, 4, 1]) + return out + + +def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): + place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ["Input"], "Output", no_grad_set={"Filter"} + ) + + def test_check_grad_no_input(self): + place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ["Filter"], "Output", no_grad_set={"Input"} + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNFP16OP") + TestConv3DTransposeCUDNNFP16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNFP16 + + +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()) + or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def test_check_output(self): + place = get_device_place() + self.check_output_with_place(place) + + def test_check_grad(self): + place = get_device_place() + self.check_grad_with_place( + place, + {"Input", "Filter"}, + "Output", + ) + + def test_check_grad_no_filter(self): + place = get_device_place() + self.check_grad_with_place( + place, + ["Input"], + "Output", + no_grad_set={"Filter"}, + ) + + def test_check_grad_no_input(self): + place = get_device_place() + self.check_grad_with_place( + place, + ["Filter"], + "Output", + no_grad_set={"Input"}, + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DTransposeCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNBF16 + + +def conv3d_transpose_wrapper( + x, + weight, + stride=1, + padding=0, + output_padding=[], + output_size=[], + padding_algorithm="EXPLICIT", + groups=1, + dilation=1, + data_format="NCDHW", +): + if data_format == "AnyLayout": + data_format = "NCDHW" + return paddle._C_ops.conv3d_transpose( + x, + weight, + stride, + padding, + output_padding, + output_size, + padding_algorithm, + groups, + dilation, + data_format, + ) + + +class TestConv3DTransposeOp(OpTest): + def setUp(self): + # init as conv transpose + self.use_cudnn = False + self.check_no_input = False + self.check_no_filter = False + self.data_format = "NCHW" + self.pad = [0, 0, 0] + self.padding_algorithm = "EXPLICIT" + self.init_op_type() + self.init_kernel_type() + self.init_test_case() + + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "padding_algorithm": 
self.padding_algorithm, + "dilations": self.dilations, + "groups": self.groups, + "use_cudnn": self.use_cudnn, + "data_format": self.data_format, + } + + output = conv3dtranspose_forward_naive(input, filter, self.attrs).astype( + "float32" + ) + + if self.is_bfloat16_op(): + self.inputs = { + "Input": convert_float_to_uint16(input), + "Filter": convert_float_to_uint16(filter), + } + else: + self.inputs = { + "Input": input, + "Filter": filter, + } + output = output.astype(self.dtype) + + self.outputs = {"Output": output} + + def test_check_output(self): + if self.use_cudnn: + place = get_device_place() + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() + + def test_check_grad(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + {"Input", "Filter"}, + "Output", + max_relative_error=0.03, + ) + else: + self.check_grad({"Input", "Filter"}, "Output", max_relative_error=0.03) + + def test_check_grad_no_filter(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + ["Input"], + "Output", + max_relative_error=0.03, + no_grad_set={"Filter"}, + ) + elif self.check_no_filter: + self.check_grad( + ["Input"], + "Output", + max_relative_error=0.03, + no_grad_set={"Filter"}, + ) + + def test_check_grad_no_input(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + ["Filter"], + "Output", + max_relative_error=0.03, + no_grad_set={"Input"}, + ) + elif self.check_no_input: + self.check_grad( + ["Filter"], + "Output", + max_relative_error=0.03, + no_grad_set={"Input"}, + ) + + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + def init_kernel_type(self): + self.dtype = np.float32 + + +class TestWithSymmetricPad(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_input = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithAsymmetricPad(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 1, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithSAMEPad(TestConv3DTransposeOp): + def init_test_case(self): + self.stride = [1, 1, 2] + self.dilations = [1, 2, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 6] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 4] + self.padding_algorithm = "SAME" + + +class TestWithVALIDPad(TestConv3DTransposeOp): + def init_test_case(self): + self.stride = [2, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 4, 3] + self.padding_algorithm = "VALID" + + +class TestWithStride(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_filter = True + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = 
[f_c, 6, 3, 3, 3] + + +class TestWithGroups(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 2, 5, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3, 3] + + +class TestWithDilation(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class Test_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +# ------------ test_cudnn ------------ +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNN(TestConv3DTransposeOp): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1, 0, 0, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 4, 4, 4] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSAMEPad(TestWithSAMEPad): + def init_test_case(self): + self.stride = [1, 1, 2] + self.dilations = [1, 2, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 4, 3] + self.padding_algorithm = "SAME" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithVALIDPad(TestWithVALIDPad): + def init_test_case(self): + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.padding_algorithm = "VALID" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class 
TestCUDNNWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 2, 5, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + # Please Don't remove the following code. + # Currently, CI use cudnn V5.0 which not support dilation conv. + # class TestCUDNNWithDilation(TestWithDilation): + # def init_test_case(self): + # self.pad = [1, 1, 1] + # self.stride = [2, 2, 2] + # self.dilations = [2, 2, 2] + # self.input_size = [2, 3, 5, 5, 5] # NCDHW + # f_c = self.input_size[1] + # self.filter_size = [f_c, 6, 3, 3, 3] + # + # def init_op_type(self): + # self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNN_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 0, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithStride_NHWC(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + 
self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithGroups_NHWC(TestWithGroups): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestConv3DTransposeOp) +create_test_cudnn_fp16_class(TestWithSymmetricPad) +create_test_cudnn_fp16_class(TestWithAsymmetricPad) +create_test_cudnn_fp16_class(TestWithSAMEPad) +create_test_cudnn_fp16_class(TestWithVALIDPad) +create_test_cudnn_fp16_class(TestWithStride) +create_test_cudnn_fp16_class(TestWithGroups) +create_test_cudnn_fp16_class(TestWithDilation) +create_test_cudnn_fp16_class(Test_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DTransposeOp) +create_test_cudnn_bf16_class(TestWithSymmetricPad) +create_test_cudnn_bf16_class(TestWithAsymmetricPad) +create_test_cudnn_bf16_class(TestWithSAMEPad) +create_test_cudnn_bf16_class(TestWithVALIDPad) +create_test_cudnn_bf16_class(TestWithStride) +create_test_cudnn_bf16_class(TestWithGroups) +create_test_cudnn_bf16_class(TestWithDilation) +create_test_cudnn_bf16_class(Test_NHWC) + + +class TestConv3dTranspose(unittest.TestCase): + def error_weight_input(self): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [1, 1, 1, 1, 1]), dtype="float32") + weight = paddle.to_tensor(np.reshape(array, [1]), dtype="float32") + paddle.nn.functional.conv3d_transpose(x, weight, bias=0) + + def test_type_error(self): + self.assertRaises(ValueError, self.error_weight_input) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py new file mode 100644 index 00000000000..9bf91f5908f --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py @@ -0,0 +1,108 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
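# ---------------------------------------------------------------------------
# Reference note (a sketch following conv3dtranspose_forward_naive above, not
# executed by this test): for the cases in this file the expected spatial size
# of the transposed convolution is
#
#     out = (in - 1) * stride + dilation * (k - 1) + 1 - pad_0 - pad_1
#
# e.g. TestWithStride_NHWC below uses in=5, k=3, stride=2, dilation=1 and
# symmetric padding 1, giving out = (5 - 1) * 2 + 1 * (3 - 1) + 1 - 1 - 1 = 9.
# ---------------------------------------------------------------------------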
+ +import sys +import unittest + +sys.path.append("../../legacy_test") +from test_conv3d_transpose_op import ( + TestConv3DTransposeOp, + create_test_cudnn_bf16_class, + create_test_cudnn_fp16_class, +) + +from paddle.base import core + +core.set_cudnn_switch(False) + + +class TestWithSymmetricPad_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithAsymmetricPad_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 1, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithGroups_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_filter = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [2, 5, 5, 5, 4] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithStride_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NCDHW + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithDilation_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_input = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NCDHW + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithGroups_NHWC) +create_test_cudnn_fp16_class(TestWithStride_NHWC) +create_test_cudnn_fp16_class(TestWithDilation_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithGroups_NHWC) +create_test_cudnn_bf16_class(TestWithStride_NHWC) +create_test_cudnn_bf16_class(TestWithDilation_NHWC) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py b/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py new file mode 100644 index 00000000000..da5eeb34d0b --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py @@ -0,0 +1,323 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from unittest import TestCase + +import numpy as np +from op_test import get_device_place, is_custom_device + +import paddle +import paddle.nn.initializer as I + +from paddle.base import core + +core.set_cublas_switch(False) + + +class TestDeformConv2D(TestCase): + batch_size = 4 + spatial_shape = (5, 5) + dtype = "float32" + + def setUp(self): + self.in_channels = 2 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [0, 0] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = True + + def prepare(self): + np.random.seed(1) + paddle.seed(1) + if isinstance(self.kernel_size, int): + filter_shape = (self.kernel_size,) * 2 + else: + filter_shape = tuple(self.kernel_size) + self.filter_shape = filter_shape + + self.weight = np.random.uniform( + -1, + 1, + (self.out_channels, self.in_channels // self.groups, *filter_shape), + ).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform(-1, 1, (self.out_channels,)).astype( + self.dtype + ) + + def out_size(in_size, pad_size, dilation_size, kernel_size, stride_size): + return ( + in_size + 2 * pad_size - (dilation_size * (kernel_size - 1) + 1) + ) / stride_size + 1 + + out_h = int( + out_size( + self.spatial_shape[0], + self.padding[0], + self.dilation[0], + self.kernel_size[0], + self.stride[0], + ) + ) + out_w = int( + out_size( + self.spatial_shape[1], + self.padding[1], + self.dilation[1], + self.kernel_size[1], + self.stride[1], + ) + ) + out_shape = (out_h, out_w) + + self.input_shape = ( + self.batch_size, + self.in_channels, + *self.spatial_shape, + ) + + self.offset_shape = ( + self.batch_size, + self.deformable_groups * 2 * filter_shape[0] * filter_shape[1], + *out_shape, + ) + + self.mask_shape = ( + self.batch_size, + self.deformable_groups * filter_shape[0] * filter_shape[1], + *out_shape, + ) + + self.input = np.random.uniform(-1, 1, self.input_shape).astype(self.dtype) + + self.offset = np.random.uniform(-1, 1, self.offset_shape).astype(self.dtype) + + self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype) + + def static_graph_case_dcn(self): + main = paddle.static.Program() + start = paddle.static.Program() + paddle.enable_static() + with paddle.static.program_guard(main, start): + x = paddle.static.data( + "input", (-1, self.in_channels, -1, -1), dtype=self.dtype + ) + offset = paddle.static.data( + "offset", + ( + -1, + self.deformable_groups + * 2 + * self.filter_shape[0] + * self.filter_shape[1], + -1, + -1, + ), + dtype=self.dtype, + ) + mask = paddle.static.data( + "mask", + ( + -1, + self.deformable_groups + * self.filter_shape[0] + * self.filter_shape[1], + -1, + -1, + ), + dtype=self.dtype, + ) + + y_v1 = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=self.deformable_groups, + weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + )(x, offset, None) + + y_v2 = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=self.deformable_groups, + 
weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + )(x, offset, mask) + + exe = paddle.static.Executor(self.place) + exe.run(start) + out_v1, out_v2 = exe.run( + main, + feed={ + "input": self.input, + "offset": self.offset, + "mask": self.mask, + }, + fetch_list=[y_v1, y_v2], + ) + return out_v1, out_v2 + + def dygraph_case_dcn(self): + paddle.disable_static() + x = paddle.to_tensor(self.input) + offset = paddle.to_tensor(self.offset) + mask = paddle.to_tensor(self.mask) + + bias = None if self.no_bias else paddle.to_tensor(self.bias) + + deform_conv2d = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + deformable_groups=self.deformable_groups, + groups=self.groups, + weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + ) + + y_v1 = deform_conv2d(x, offset) + y_v2 = deform_conv2d(x, offset, mask) + + out_v1 = y_v1.numpy() + out_v2 = y_v2.numpy() + + return out_v1, out_v2 + + def _test_identity(self): + self.prepare() + static_dcn_v1, static_dcn_v2 = self.static_graph_case_dcn() + dy_dcn_v1, dy_dcn_v2 = self.dygraph_case_dcn() + np.testing.assert_array_almost_equal(static_dcn_v1, dy_dcn_v1) + np.testing.assert_array_almost_equal(static_dcn_v2, dy_dcn_v2) + + def test_identity(self): + self.place = paddle.CPUPlace() + self._test_identity() + + if paddle.is_compiled_with_cuda() or is_custom_device(): + self.place = get_device_place() + self._test_identity() + + +# testcases for DeformConv2D +class TestDeformConv2DWithPadding(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = True + + +class TestDeformConv2DWithBias(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithAsynPadding(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithDilation(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [3, 3] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithStride(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [2, 2] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithDeformable_Groups(TestDeformConv2D): + def setUp(self): + self.in_channels = 5 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 5 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithGroups(TestDeformConv2D): + def setUp(self): + self.in_channels = 5 + self.out_channels = 5 + 
self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 5 + self.no_bias = False + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py b/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py new file mode 100644 index 00000000000..1f26abb73f8 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py @@ -0,0 +1,504 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from itertools import product + +import numpy as np +from op_test import OpTest + +import paddle + +paddle.enable_static() + +from paddle.base import core + +core.set_cublas_switch(False) + + +def dmc_bilinear(data_im, height, width, h, w): + h_low = int(np.floor(h)) + w_low = int(np.floor(w)) + h_high = h_low + 1 + w_high = w_low + 1 + + lh = h - h_low + lw = w - w_low + hh = 1 - lh + hw = 1 - lw + + v1 = 0 + if h_low >= 0 and w_low >= 0: + v1 = data_im[h_low, w_low] + v2 = 0 + if h_low >= 0 and w_high <= width - 1: + v2 = data_im[h_low, w_high] + v3 = 0 + if h_high <= height - 1 and w_low >= 0: + v3 = data_im[h_high, w_low] + v4 = 0 + if h_high <= height - 1 and w_high <= width - 1: + v4 = data_im[h_high, w_high] + + w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 + + return val + + +def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param): + in_n, in_c, in_h, in_w = input.shape + out_c, f_c, f_h, f_w = filter.shape + + assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w) + assert mask.shape == (in_n, f_h * f_w, in_h, in_w) + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + + stride, pad, dilation = ( + conv_param["stride"], + conv_param["pad"], + conv_param["dilation"], + ) + out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] + out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] + assert out_h == in_h + assert out_w == in_w + + col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w)) + for n, c, h, w, kh, kw in product( + range(in_n), + range(in_c), + range(out_h), + range(out_w), + range(f_h), + range(f_w), + ): + offset_h_table = offset[n, ::2, h, w].reshape(f_h, f_w) + offset_w_table = offset[n, 1::2, h, w].reshape(f_h, f_w) + mask_table = mask[n, :, h, w].reshape(f_h, f_w) + offset_h = offset_h_table[kh, kw] + offset_w = offset_w_table[kh, kw] + val = 0 + im_h = h * stride[0] + kh * dilation[0] + offset_h - pad[0] + im_w = w * stride[0] + kw * dilation[0] + offset_w - pad[1] + if im_h > -1 and im_w > -1 and im_h < in_h and im_w < in_h: + val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w) + val_out = val * mask_table[kh, kw] + col_buffer[n, c * f_h * f_w + kh * f_w + kw, h * in_w + w] = val_out + + out = np.zeros((in_n, group, int(out_c // group), out_h * out_w)) + weight = filter.reshape(group, 
int(out_c // group), f_c * f_h * f_w) + col_buffer = col_buffer.reshape( + (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w) + ) + for n in range(in_n): + for g in range(group): + out[n, g] = np.matmul(weight[g], col_buffer[n, g]) + out = out.reshape(in_n, out_c, out_h, out_w) + return out + + +def deform_conv2d_wrapper( + x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1, +): + return paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + None, + stride, + padding, + dilation, + deformable_groups, + groups, + mask, + ) + + +class TestModulatedDeformableConvOp(OpTest): + def setUp(self): + self.python_api = deform_conv2d_wrapper + self.op_type = "deformable_conv" + self.init_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + offset = 10 * np.random.random(self.offset_size).astype(self.dtype) + mask = 10 * np.random.random(self.mask_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + output = dconv_im2col_gemm(input, offset, mask, filter, self.groups, conv_param) + output = output.astype(self.dtype) + + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Offset": OpTest.np_dtype_to_base_dtype(offset), + "Mask": OpTest.np_dtype_to_base_dtype(mask), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + } + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + "deformable_groups": self.deformable_groups, + "im2col_step": self.im2col_step, + "dilations": self.dilations, + } + self.outputs = {"Output": output} + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad( + {"Input", "Offset", "Mask", "Filter"}, + "Output", + max_relative_error=0.05, + check_pir=True, + ) + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_type(self): + self.dtype = np.float32 + + +class TestWithStride(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [3, 3] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + 
self.input_size[2], + self.input_size[3], + ] + + +class TestWithDilation(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [1, 1] + self.input_size = [4, 3, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [2, 2] + + +class TestWith3x3(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithGroup(TestModulatedDeformableConvOp): + def init_group(self): + self.groups = 2 + + +class TestWithDouble(TestModulatedDeformableConvOp): + def init_type(self): + self.dtype = np.float64 + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 6, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestModulatedDeformableConvInvalidInput(unittest.TestCase): + def test_error_api(self): + def test_invalid_input(): + paddle.enable_static() + input = [1, 3, 32, 32] + offset = paddle.static.data( + name="offset", shape=[None, 3, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask", shape=[None, 3, 32, 32], dtype="float32" + ) + loss = paddle.vision.ops.DeformConv2D( + in_channels=input[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + self.assertRaises(TypeError, test_invalid_input) + + def test_invalid_offset(): + paddle.enable_static() + input = paddle.static.data( + name="input", shape=[None, 3, 32, 32], dtype="int32" + ) + offset = paddle.static.data( + name="offset", shape=[None, 3, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask", shape=[None, 3, 32, 32], dtype="float32" + ) + loss = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + self.assertRaises(TypeError, test_invalid_offset) + + def test_invalid_groups(): + 
paddle.enable_static() + input = paddle.static.data( + name="input_groups", shape=[1, 1, 1, 1], dtype="float32" + ) + offset = paddle.static.data( + name="offset_groups", shape=[1, 1], dtype="float32" + ) + mask = paddle.static.data(name="mask_groups", shape=[1], dtype="float32") + loss = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], + out_channels=1, + kernel_size=1, + padding=1, + groups=0, + )(input, offset, mask) + + self.assertRaises(ZeroDivisionError, test_invalid_groups) + + +class TestDeformConv2DAPI(unittest.TestCase): + def test_api(self): + def test_deform_conv2d_v1(): + paddle.enable_static() + input = paddle.static.data( + name="input_v1", shape=[None, 3, 32, 32], dtype="float32" + ) + offset = paddle.static.data( + name="offset_v1", shape=[None, 4, 32, 32], dtype="float32" + ) + out = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, None) + assert tuple(out.shape) == (-1, 4, 32, 32) + + test_deform_conv2d_v1() + + def test_deform_conv2d_v2(): + paddle.enable_static() + input = paddle.static.data( + name="input_v2", shape=[None, 3, 32, 32], dtype="float32" + ) + offset = paddle.static.data( + name="offset_v2", shape=[None, 4, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask_v2", shape=[None, 2, 32, 32], dtype="float32" + ) + out = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + assert tuple(out.shape) == (-1, 4, 32, 32) + + test_deform_conv2d_v2() + + +class TestModulatedDeformableConvOp_ZeroSize(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + # 0-size + self.input_size = [0, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestDeformConv2DAPI_CPU_FP16(unittest.TestCase): + def setUp(self): + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.data_format = "NCL" + + def test_cpu_fp16(self): + with paddle.base.dygraph.guard(paddle.CPUPlace()): + x = paddle.ones([4, 5, 5, 5]) + offset = paddle.ones([4, 90, 5, 5]).astype(paddle.float16) + weight = paddle.ones([5, 5, 3, 3]).astype(paddle.float16) + bias = paddle.ones([5]).astype(paddle.float16) + mask = paddle.ones([4, 45, 5, 5]).astype(paddle.float16) + + # If there is an error, an error will be thrown. 
+ out = paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=5, + mask=mask, + ) + np.testing.assert_allclose(out.shape, [4, 5, 5, 5]) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py b/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py new file mode 100644 index 00000000000..6a4244db267 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py @@ -0,0 +1,319 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from itertools import product + +import numpy as np +from op_test import OpTest + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + + +def dmc_bilinear(data_im, height, width, h, w): + h_low = int(np.floor(h)) + w_low = int(np.floor(w)) + h_high = h_low + 1 + w_high = w_low + 1 + + lh = h - h_low + lw = w - w_low + hh = 1 - lh + hw = 1 - lw + + v1 = 0 + if h_low >= 0 and w_low >= 0: + v1 = data_im[h_low, w_low] + v2 = 0 + if h_low >= 0 and w_high <= width - 1: + v2 = data_im[h_low, w_high] + v3 = 0 + if h_high <= height - 1 and w_low >= 0: + v3 = data_im[h_high, w_low] + v4 = 0 + if h_high <= height - 1 and w_high <= width - 1: + v4 = data_im[h_high, w_high] + + w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 + + return val + + +def dconv_im2col_gemm(input, offset, filter, group, conv_param): + in_n, in_c, in_h, in_w = input.shape + out_c, f_c, f_h, f_w = filter.shape + + assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w) + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + + stride, pad, dilation = ( + conv_param["stride"], + conv_param["pad"], + conv_param["dilation"], + ) + out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] + out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] + assert out_h == in_h + assert out_w == in_w + + col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w)) + for n, c, h, w, kh, kw in product( + range(in_n), + range(in_c), + range(out_h), + range(out_w), + range(f_h), + range(f_w), + ): + offset_h_table = offset[n, ::2, h, w].reshape(f_h, f_w) + offset_w_table = offset[n, 1::2, h, w].reshape(f_h, f_w) + offset_h = offset_h_table[kh, kw] + offset_w = offset_w_table[kh, kw] + val = 0 + im_h = h * stride[0] + kh * dilation[0] + offset_h - pad[0] + im_w = w * stride[0] + kw * dilation[0] + offset_w - pad[1] + if im_h > -1 and im_w > -1 and im_h < in_h and im_w < in_h: + val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w) + val_out = val + + col_buffer[n, c * f_h * f_w + kh * f_w + kw, h * in_w + w] = val_out + + out = np.zeros((in_n, group, int(out_c // group), out_h * out_w)) + weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w) + col_buffer = 
col_buffer.reshape( + (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w) + ) + for n in range(in_n): + for g in range(group): + out[n, g] = np.matmul(weight[g], col_buffer[n, g]) + out = out.reshape(in_n, out_c, out_h, out_w) + return out + + +def deform_conv2d_wrapper( + x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1, +): + return paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + None, + stride, + padding, + dilation, + deformable_groups, + groups, + mask, + ) + + +class TestModulatedDeformableConvOp(OpTest): + def setUp(self): + self.python_api = deform_conv2d_wrapper + self.op_type = "deformable_conv_v1" + self.init_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + offset = 10 * np.random.random(self.offset_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + output = dconv_im2col_gemm(input, offset, filter, self.groups, conv_param) + output = output.astype(self.dtype) + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Offset": OpTest.np_dtype_to_base_dtype(offset), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + } + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + "deformable_groups": self.deformable_groups, + "im2col_step": self.im2col_step, + "dilations": self.dilations, + } + self.outputs = {"Output": output} + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad( + ["Input", "Offset", "Filter"], + "Output", + max_relative_error=0.05, + check_pir=True, + ) + + def test_check_grad_no_filter(self): + self.check_grad( + ["Input", "Offset"], + "Output", + max_relative_error=0.1, + no_grad_set={"Filter"}, + check_pir=True, + ) + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 4, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_type(self): + self.dtype = np.float32 + + +class TestWithStride(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [3, 3] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithDilation(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [1, 1] + self.input_size = [5, 3, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 
1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [2, 2] + + +class TestWith1x1(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [40, f_c, 1, 1] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithGroup(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_group(self): + self.groups = 2 + + +class TestWithDouble(TestModulatedDeformableConvOp): + def init_type(self): + self.dtype = np.float64 + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py b/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py new file mode 100644 index 00000000000..f3f3bb30e34 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py @@ -0,0 +1,201 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
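+# Quick reference for the 0-D cases below, assuming numpy-compatible einsum
+# semantics: labels absent from the output are summed out, so for a 0-D x and
+# a 2x2 y,
+#   paddle.einsum("...,ij->...", x, y) == x * y.sum()
+# and the result keeps x's empty shape [].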
+ +import os +import unittest + +import numpy as np +from numpy.testing import assert_allclose + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + +os.environ["NVIDIA_TF32_OVERRIDE"] = "0" + + +class Test0DCase0(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + def tearDown(self): + paddle.enable_static() + + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + y = paddle.rand([]) + y.stop_gradient = False + z = paddle.einsum("...,...->...", x, y) + assert_allclose( + z.numpy(), + np.einsum("...,...->...", x.numpy(), y.numpy()), + atol=1e-6, + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert y.grad.shape == [] + + +class Test0DCase1(Test0DCase0): + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("...,ij->...", x, y) + assert_allclose( + z.numpy(), np.einsum("...,ij->...", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert y.grad.shape == [2, 2] + + +class Test0DCase2(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("ij,ij->", x, y) + assert_allclose( + z.numpy(), np.einsum("ij,ij->", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [2, 2] + + +class Test0DCase3(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = True + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("ij,ij->", x, y) + assert_allclose( + z.numpy(), np.einsum("ij,ij->", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad is None + assert y.grad.shape == [2, 2] + + +class Test0DCase4(Test0DCase0): + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + z = paddle.einsum("...->...", x) + assert_allclose(z.numpy(), np.einsum("...->...", x.numpy()), atol=1e-6) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert x.grad.numpy() == 1.0 + + +class Test0DCase5(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("i...j, i...j->...", x, y) + assert_allclose( + z.numpy(), + np.einsum("i...j, i...j->...", x.numpy(), y.numpy()), + atol=1e-6, + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [2, 2] + + +class Test0DCase6(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + z = paddle.einsum("ij->", x) + assert_allclose(z.numpy(), np.einsum("ij->", x.numpy()), atol=1e-6) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + + +class Test0DCase7(Test0DCase0): + def test_func(self): + """ + 3 operands. 
+ """ + x = paddle.rand([2, 2]) + y = paddle.rand([]) + z = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + o = paddle.einsum("ij...,...,...->...", x, y, z) + assert_allclose( + o.numpy(), + np.einsum("ij...,...,...->...", x.numpy(), y.numpy(), z.numpy()), + atol=1e-6, + ) + o.mean().backward() + assert o.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [] + assert z.grad.shape == [] + + +class Test0DCase8(Test0DCase0): + def test_func(self): + """ + 3 operands. + """ + x = paddle.rand([2, 2]) + y = paddle.rand([]) + z = paddle.rand([]) + e = paddle.rand([3, 1]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + e.stop_gradient = False + o = paddle.einsum("ij...,...,..., km->...", x, y, z, e) + assert_allclose( + o.numpy(), + np.einsum( + "ij...,...,...,km->...", + x.numpy(), + y.numpy(), + z.numpy(), + e.numpy(), + ), + atol=1e-6, + ) + o.mean().backward() + assert o.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [] + assert z.grad.shape == [] + assert e.grad.shape == [3, 1] + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py b/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py new file mode 100644 index 00000000000..67afd71c5f9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py @@ -0,0 +1,138 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from op_test import OpTest + +from paddle.base import core + +core.set_cublas_switch(False) + +SEED = 2020 + + +def fc_refer(matrix, with_bias, with_relu=False): + in_n, in_c, in_h, in_w = matrix.input.shape + w_i, w_o = matrix.weights.shape + + x_data = np.reshape(matrix.input, [in_n, in_c * in_h * in_w]) + w_data = np.reshape(matrix.weights, [w_i, w_o]) + b_data = np.reshape(matrix.bias, [1, w_o]) + result = None + + if with_bias: + result = np.dot(x_data, w_data) + b_data + else: + result = np.dot(x_data, w_data) + + if with_relu: + return np.maximum(result, 0) + else: + return result + + +class MatrixGenerate: + def __init__(self, mb, ic, oc, h, w, bias_dims=2): + self.input = np.random.random((mb, ic, h, w)).astype("float32") + self.weights = np.random.random((ic * h * w, oc)).astype("float32") + if bias_dims == 2: + self.bias = np.random.random((1, oc)).astype("float32") + else: + self.bias = np.random.random(oc).astype("float32") + + +class TestFCOp(OpTest): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 10, 15, 3, 3, 2) + + def setUp(self): + self.op_type = "fc" + self.config() + + if self.with_bias: + self.inputs = { + "Input": self.matrix.input, + "W": self.matrix.weights, + "Bias": self.matrix.bias, + } + else: + self.inputs = {"Input": self.matrix.input, "W": self.matrix.weights} + + if self.with_relu: + activation_type = "relu" + else: + activation_type = "" + self.attrs = {"use_onednn": False, "activation_type": activation_type} + + self.outputs = {"Out": fc_refer(self.matrix, self.with_bias, self.with_relu)} + + def test_check_output(self): + self.check_output(check_dygraph=False) + + +class TestFCOpNoBias1(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(2, 8, 10, 1, 1, 2) + + +class TestFCOpNoBias2(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1) + + +class TestFCOpNoBias4(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(1, 32, 64, 3, 3, 1) + + +class TestFCOpWithBias1(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = False + self.matrix = MatrixGenerate(3, 8, 10, 2, 1, 2) + + +class TestFCOpWithBias2(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1) + + +class TestFCOpWithBias3(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 64, 32, 3, 3, 1) + + +class TestFCOpWithPadding(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 4, 3, 128, 128, 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py b/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py new file mode 100644 index 00000000000..803b00cc6b4 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py @@ -0,0 +1,1106 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from unittest import TestCase + +import numpy as np +from op_test import get_device, is_custom_device + +import paddle +import paddle.nn.functional as F +from paddle import base +from paddle.base.wrapped_decorator import wrap_decorator +from paddle.vision.models import resnet50, resnet101 + +from paddle.base import core + +core.set_cudnn_switch(False) + +core.set_cublas_switch(False) + + +def _dygraph_guard_(func): + def __impl__(*args, **kwargs): + if base.in_dygraph_mode(): + return func(*args, **kwargs) + else: + with base.dygraph.guard(): + return func(*args, **kwargs) + + return __impl__ + + +dygraph_guard = wrap_decorator(_dygraph_guard_) + + +def random_var(size, low=-1, high=1, dtype="float32"): + x_np = np.random.uniform(low=low, high=high, size=size).astype(dtype) + return paddle.to_tensor(x_np) + + +class TestEagerGrad(TestCase): + def test_simple_example_eager_grad(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + out = paddle.matmul(x, y) + dx = base.dygraph.grad(out, x) + + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + np.testing.assert_allclose(dx[0].numpy(), expected_dx, rtol=1e-05) + + def test_simple_example_eager_grad_allow_unused(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + dx = base.dygraph.grad(out, [x, z], allow_unused=True) + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + np.testing.assert_allclose(dx[0].numpy(), expected_dx, rtol=1e-05) + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + # x is unused input in the graph + self.assertIsNone(dx[1]) + + def test_simple_example_eager_grad_not_allow_unused(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # allow_unused is false in default + dx = base.dygraph.grad(out, [x, z]) + except ValueError as e: + error_msg = str(e) + assert error_msg.find("allow_unused") > 0 + + def test_simple_example_eager_grad_duplicate_input(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = 
np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # duplicate input will arise RuntimeError errors + dx = base.dygraph.grad(out, [x, x]) + except RuntimeError as e: + error_msg = str(e) + assert error_msg.find("duplicate") > 0 + + def test_simple_example_eager_grad_duplicate_output(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # duplicate output will arise RuntimeError errors + dx = base.dygraph.grad([out, out], [x]) + except RuntimeError as e: + error_msg = str(e) + assert error_msg.find("duplicate") > 0 + + def test_simple_example_eager_two_grad_output(self): + x1 = paddle.to_tensor([1.0, 2.0]) + x1.stop_gradient = False + x2 = paddle.to_tensor([1.0, 2.0]) + x2.stop_gradient = False + out1 = x1 * 2 + out2 = x2 * 2 + + dout2_record_by_hook = [] + + def record_hook(grad): + dout2_record_by_hook.append(grad) + + out2.register_hook(record_hook) + + out3 = paddle.multiply(out1, out2) + out4 = paddle.mean(out3) + egr_dout2, egr_dout3 = paddle.grad([out4], [out2, out3]) + + np.testing.assert_array_equal( + dout2_record_by_hook[0].numpy(), np.array([1.0, 2.0]) + ) + + x1 = paddle.to_tensor([1.0, 2.0]) + x1.stop_gradient = False + x2 = paddle.to_tensor([1.0, 2.0]) + x2.stop_gradient = False + out1 = x1 * 2 + out2 = x2 * 2 + + out3 = paddle.multiply(out1, out2) + out4 = paddle.mean(out3) + dout2, dout3 = paddle.grad([out4], [out2, out3]) + + self.assertEqual(dout2.stop_gradient, egr_dout2.stop_gradient) + self.assertEqual(dout3.stop_gradient, egr_dout3.stop_gradient) + np.testing.assert_array_equal(dout2.numpy(), egr_dout2.numpy()) + np.testing.assert_array_equal(dout3.numpy(), egr_dout3.numpy()) + + +class TestDygraphDoubleGrad(TestCase): + def setUp(self): + self.sort_sum_gradient = False + self.shape = [5, 10] + + def grad( + self, + outputs, + inputs, + grad_outputs=None, + no_grad_vars=None, + retain_graph=None, + create_graph=False, + allow_unused=False, + ): + base.set_flags({"FLAGS_sort_sum_gradient": self.sort_sum_gradient}) + return base.dygraph.grad( + outputs=outputs, + inputs=inputs, + grad_outputs=grad_outputs, + no_grad_vars=no_grad_vars, + retain_graph=retain_graph, + create_graph=create_graph, + allow_unused=allow_unused, + ) + + @dygraph_guard + def test_exception(self): + with self.assertRaises(AssertionError): + self.grad(None, None) + + shape = self.shape + + with self.assertRaises(AssertionError): + self.grad(1, random_var(shape)) + + with self.assertRaises(AssertionError): + self.grad(random_var(shape), 1) + + with self.assertRaises(AssertionError): + self.grad([1], [random_var(shape)]) + + with self.assertRaises(AssertionError): + self.grad([random_var(shape)], [1]) + + with self.assertRaises(AssertionError): + self.grad( + [random_var(shape), random_var(shape)], + [random_var(shape)], + [random_var(shape)], + ) + + with self.assertRaises(AssertionError): + 
self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=[1]) + + with self.assertRaises(AssertionError): + self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + + @dygraph_guard + def test_simple_example(self): + x = random_var(self.shape) + x.stop_gradient = False + y = x + 1 + + for create_graph in [False, True]: + (dx,) = self.grad([x], [x], create_graph=create_graph, retain_graph=True) + self.assertEqual(dx.shape, x.shape) + self.assertTrue(np.all(dx.numpy() == 1)) + self.assertNotEqual(dx.stop_gradient, create_graph) + + (dx_mul_2,) = self.grad( + [y, x], [x], create_graph=create_graph, retain_graph=True + ) + self.assertEqual(dx_mul_2.shape, x.shape) + self.assertTrue(np.all(dx_mul_2.numpy() == 2)) + self.assertNotEqual(dx_mul_2.stop_gradient, create_graph) + + (none_grad,) = self.grad( + [x], [y], create_graph=create_graph, allow_unused=True + ) + self.assertIsNone(none_grad) + + (grad_with_none_and_not_none,) = self.grad( + [x, y], [y], create_graph=create_graph + ) + self.assertTrue(grad_with_none_and_not_none.shape, x.shape) + self.assertTrue(np.all(grad_with_none_and_not_none.numpy() == 1)) + self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + + @dygraph_guard + def test_example_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = F.relu(x) + y2 = F.relu(x) + z = y1 + y2 + w = z * z + + w_mean = paddle.mean(w) + del y1, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=True, no_grad_vars=[y2]) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * (x_np > 0) * 2 + ).astype("float32") + + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + @dygraph_guard + def test_none_one_initial_gradient(self): + numel = 1 + for s in self.shape: + numel *= s + + half_numel = int(numel / 2) + half_x_positive = np.random.uniform(low=1, high=2, size=[half_numel]) + half_x_negative = np.random.uniform(low=-2, high=-1, size=[numel - half_numel]) + x_np = np.array(list(half_x_positive) + list(half_x_negative)).astype("float32") + np.random.shuffle(x_np) + + x = paddle.to_tensor(x_np) + x.stop_gradient = False + + alpha = 0.2 + y = paddle.nn.functional.leaky_relu(x, alpha) + y = y * y + z = y * y + + x_np = x.numpy() + relu_x_np = np.maximum(x_np, alpha * x_np).astype("float32") + relu_x_grad_np = ((x_np > 0) + (x_np < 0) * alpha).astype("float32") + dy_expected = (relu_x_np * relu_x_grad_np * 2).astype("float32") + dz_expected = (np.power(relu_x_np, 3) * relu_x_grad_np * 4).astype("float32") + + random_grad_y = random_var(y.shape, low=1, high=2) + random_grad_z = random_var(z.shape, low=1, high=2) + ones_grad_y = np.ones(y.shape).astype("float32") + ones_grad_z = np.ones(z.shape).astype("float32") + + original_random_grad_y = random_grad_y.numpy() + original_random_grad_z = random_grad_z.numpy() + + for grad_y in [random_grad_y]: + for grad_z in [random_grad_z]: + for create_graph in [False, True]: + (dx_actual,) = self.grad( + outputs=[y, z], + inputs=[x], + grad_outputs=[grad_y, grad_z], + create_graph=create_graph, + retain_graph=True, + ) + + grad_y_np = ones_grad_y if grad_y is None else grad_y.numpy() + grad_z_np = ones_grad_z if grad_z is None else grad_z.numpy() + + dx_expected = dy_expected * grad_y_np + dz_expected * grad_z_np + np.testing.assert_allclose( + dx_actual.numpy(), dx_expected, rtol=1e-05 + ) + + if grad_y is not 
None: + self.assertTrue(grad_y.stop_gradient) + np.testing.assert_array_equal( + grad_y.numpy(), original_random_grad_y + ) + + if grad_z is not None: + self.assertTrue(grad_z.stop_gradient) + np.testing.assert_array_equal( + grad_z.numpy(), original_random_grad_z + ) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_create_graph(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y = F.relu(x) + z = y + 1 + w = z * z + + w_mean = paddle.mean(w) + del y, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=True) + del w_mean + + self.assertFalse(dx_actual.stop_gradient) + + # Theoretical result based on math calculation + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + 1) * (x_np > 0) * 2 + ).astype("float32") + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward(retain_graph=True) + + x_grad_actual = x.gradient() + x_grad_expected = ( + 2.0 / float(numel) * (x_np + dx_expected * (x_np > 0) * 2 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * ( + 2.0 + / float(numel) + * (x_np + dx_expected * (x_np > 0) * 2 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = F.relu(x) + y2 = F.relu(x) + z = y1 + y2 + w = z * z + + w_mean = paddle.mean(w) + del y1, z, w + + (dx_actual,) = self.grad( + [w_mean], + [x], + retain_graph=True, + create_graph=True, + no_grad_vars=[y2], + ) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * (x_np > 0) * 2 + ).astype("float32") + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = ( + 2.0 / float(numel) * (x_np + dx_expected * (x_np > 0) * 4 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_not_create_graph(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y = F.relu(x) + z = y + 1 + w = z * z + + w_mean = paddle.mean(w) + del y, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=False) + del w_mean + + self.assertTrue(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + 1) * (x_np > 0) * 2 + ).astype("float32") + + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + +class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): + def setUp(self): + self.sort_sum_gradient = True + self.shape = [5, 10] + + +class TestDygraphDoubleGradVisitedUniq(TestCase): + def test_compare(self): + value = np.random.uniform(-0.5, 
0.5, 100).reshape(10, 2, 5).astype("float32") + + def model_f(input): + linear = paddle.nn.Linear(5, 3) + for i in range(10): + if i == 0: + out = linear(input) + else: + out = out + linear(input) + return out + + base.set_flags({"FLAGS_sort_sum_gradient": True}) + + with base.dygraph.guard(): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + a = paddle.to_tensor(value) + a.stop_gradient = False + + out = model_f(a) + + dx = base.dygraph.grad( + outputs=[out], + inputs=[a], + create_graph=False, + only_inputs=True, + allow_unused=False, + ) + + grad_1 = dx[0].numpy() + + with base.dygraph.guard(): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + a = paddle.to_tensor(value) + a.stop_gradient = False + + out = model_f(a) + out.backward() + + grad_2 = a.gradient() + + np.testing.assert_array_equal(grad_1, grad_2) + + +class TestDoubleGradResNet(TestCase): + def setUp(self): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + self.data = np.random.rand(1, 3, 224, 224).astype(np.float32) + + @dygraph_guard + def test_resnet_resnet50(self): + model = resnet50(pretrained=False) + egr_data = paddle.to_tensor(self.data) + egr_data.stop_gradient = False + egr_out = model(egr_data) + egr_preds = paddle.argmax(egr_out, axis=1) + egr_label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(egr_preds), num_classes=egr_out.shape[1] + ) + egr_target = paddle.sum(egr_out * egr_label_onehot, axis=1) + + egr_g = paddle.grad(outputs=egr_target, inputs=egr_out)[0] + egr_g_numpy = egr_g.numpy() + self.assertEqual(list(egr_g_numpy.shape), list(egr_out.shape)) + + model = resnet50(pretrained=False) + data = paddle.to_tensor(self.data) + data.stop_gradient = False + out = model(data) + preds = paddle.argmax(out, axis=1) + label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(preds), num_classes=out.shape[1] + ) + target = paddle.sum(out * label_onehot, axis=1) + + g = paddle.grad(outputs=target, inputs=out)[0] + g_numpy = g.numpy() + self.assertEqual(list(g_numpy.shape), list(out.shape)) + + np.testing.assert_array_equal(egr_out, out) + np.testing.assert_array_equal(egr_g_numpy, g_numpy) + + @dygraph_guard + def test_resnet_resnet101(self): + model = resnet101(pretrained=False) + egr_data = paddle.to_tensor(self.data) + egr_data.stop_gradient = False + egr_out = model(egr_data) + egr_preds = paddle.argmax(egr_out, axis=1) + egr_label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(egr_preds), num_classes=egr_out.shape[1] + ) + egr_target = paddle.sum(egr_out * egr_label_onehot, axis=1) + + egr_g = paddle.grad(outputs=egr_target, 
inputs=egr_out)[0] + egr_g_numpy = egr_g.numpy() + self.assertEqual(list(egr_g_numpy.shape), list(egr_out.shape)) + + model = resnet101(pretrained=False) + data = paddle.to_tensor(self.data) + data.stop_gradient = False + out = model(data) + preds = paddle.argmax(out, axis=1) + label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(preds), num_classes=out.shape[1] + ) + target = paddle.sum(out * label_onehot, axis=1) + + g = paddle.grad(outputs=target, inputs=out)[0] + g_numpy = g.numpy() + self.assertEqual(list(g_numpy.shape), list(out.shape)) + + np.testing.assert_array_equal(egr_out, out) + np.testing.assert_array_equal(egr_g_numpy, g_numpy) + + +class TestDoubleGradBasics(TestCase): + def test_matmul(self): + input_numpy = np.ones([3, 3]) * 2 + x = paddle.to_tensor(input_numpy, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy, stop_gradient=False, dtype="float32") + grad_out = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + + out = paddle.matmul(x, y, False, False) + new_x_g, new_y_g = paddle.grad( + [out], [x, y], [grad_out], retain_graph=True, create_graph=True + ) + new_x_g.backward() + + out_ref = np.ones([3, 3]) * 12.0 + np.testing.assert_array_equal(out.numpy(), out_ref) + + new_x_g_ref = np.ones([3, 3]) * 6.0 + new_y_g_ref = np.ones([3, 3]) * 6.0 + np.testing.assert_array_equal(new_x_g.numpy(), new_x_g_ref) + np.testing.assert_array_equal(new_y_g.numpy(), new_y_g_ref) + + x_grad_ref = np.ones([3, 3]) * 0.0 + np.testing.assert_array_equal(x.grad.numpy(), x_grad_ref) + + y_grad_ref = np.ones([3, 3]) * 3.0 + np.testing.assert_array_equal(y.grad.numpy(), y_grad_ref) + + grad_out_grad_ref = np.ones([3, 3]) * 6.0 + np.testing.assert_array_equal(grad_out.grad.numpy(), grad_out_grad_ref) + + +class TestDygraphDoubleGradMatmul(TestCase): + # case1: ddy is none, no broadcast,dims != 1 + def test_matmul_double_grad_case1(self): + input_numpy_x = np.random.random([3, 3]).astype("float32") + input_numpy_y = np.random.random([3, 3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + ddy = ddx + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dx, dy], + [x, y, dout], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + dy_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + ddout_expected1 = np.matmul(np.ones([3, 3], dtype="float32"), input_numpy_y) + ddout_expected2 = np.matmul(input_numpy_x, np.ones([3, 3], dtype="float32")) + ddout_expected = ddout_expected1 + ddout_expected2 + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in 
zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case2: ddx is none,no broadcast, dims != 1 + def test_matmul_double_grad_case2(self): + input_numpy_x = np.random.random([3, 3]).astype("float32") + input_numpy_y = np.random.random([3, 3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + # when x isnot be differentiate in first grad dy in second grad could be None in composite op + dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + ddout_expected = np.matmul(input_numpy_x, np.ones([3, 3], dtype="float32")) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case3: ddx is none, dims = 1 + def test_matmul_double_grad_case3(self): + input_numpy_x = np.random.random([3]).astype("float32") + input_numpy_y = np.random.random([3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32") + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor(np.ones([3]), stop_gradient=False, dtype="float32") + # when x is not be differentiate in first grad, dy from second grad could be None in composite api. 
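+            # For 1-D x and y, out = dot(x, y), so the first grad above gives
+            # dy = dout * x. Back-propagating the cotangent ddy through that
+            # relation yields dx_double_grad = ddy * dout (all ones here) and
+            # ddout = dot(x, ddy), which is what expected() computes below.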
+ dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([3], dtype="float32") + ddout_expected = np.matmul(input_numpy_x, np.ones([3], dtype="float32")) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case4: ddy is none, dims = 1 + def test_matmul_double_grad_case4(self): + input_numpy_x = np.random.random([3]).astype("float32") + input_numpy_y = np.random.random([3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32") + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor(np.ones([3]), stop_gradient=False, dtype="float32") + # when y is not be differentiate in first grad, dx from second grad could be None in composite api. + dy_double_grad, ddout = paddle.grad( + [dx], + [y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dy_double_grad, ddout + + def expected(): + dy_double_grad_expected = np.ones([3], dtype="float32") + ddout_expected = np.matmul(input_numpy_y, np.ones([3], dtype="float32")) + return ( + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case5: ddx is none, broadcast, dims != 1 + def test_matmul_double_grad_case5(self): + input_numpy_x = np.random.random([2, 1]).astype("float32") + input_numpy_y = np.random.random([1]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([2]), stop_gradient=False, dtype="float32") + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32") + dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([2, 1], dtype="float32") + ddout_expected = np.matmul(input_numpy_x, np.ones([1], dtype="float32")) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, 
actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case6: ddy is none, broadcast, dims != 1 + def test_matmul_double_grad_case6(self): + input_numpy_x = np.random.random([2, 1]).astype("float32") + input_numpy_y = np.random.random([1]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([2]), stop_gradient=False, dtype="float32") + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([2, 1]), stop_gradient=False, dtype="float32" + ) + dy_double_grad, ddout = paddle.grad( + [dx], + [y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dy_double_grad, ddout + + def expected(): + dy_double_grad_expected = np.ones([1], dtype="float32") * 2 + ddout_expected = np.ones([2], dtype="float32") * input_numpy_y[0] + return ( + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # TODO(Ruting) test complex dtype when composite api support + """ + # case7: ddx is none, dims = 1, complex dtype + def test_matmul_double_grad_case7(self): + input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y_conj = np.conjugate(input_numpy_y) + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='complex64' + ) + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='complex64' + ) + # when y is not be differentiate in first grad, dx from second grad could be None in composite api. 
+ dy_double_grad, ddout = paddle.grad( + [dx], + [y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dy_double_grad, ddout + + def expected(): + dy_double_grad_expected = np.ones( + [3], dtype="float32" + ) + 0j * np.ones([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_y_conj, np.ones([3], dtype="float32") + ) + return ( + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if (paddle.is_compiled_with_cuda() or is_custom_device()): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + + # case8: ddy is none, dims = 1, complex dtype + def test_matmul_double_grad_case8(self): + input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_x_conj = np.conjugate(input_numpy_x) + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='complex64' + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='complex64' + ) + dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_x_conj, np.ones([3], dtype="float32") + ) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if (paddle.is_compiled_with_cuda() or is_custom_device()): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + """ + + def test_value_error(self): + def test(): + import paddle + from paddle import nn + + model = nn.Sequential(nn.Linear(3, 4)) + + x = paddle.randn([4, 1]) + y = paddle.randn([4, 1]) + z = paddle.randn([4, 1]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + out = model(paddle.concat((x, y, z), axis=1)) + + data = { + "x": x, + "y": y, + "z": z, + "u": out[:, 0:1], + "v": out[:, 1:2], + "w": out[:, 2:3], + "p": out[:, 3:4], + } + + v = out[:, 1:2] + z = paddle.grad(v, x, create_graph=True)[0] + zz = paddle.grad(z, x, create_graph=True)[0] + + with self.assertRaises(ValueError): + test() + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py b/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py new file mode 100644 index 00000000000..e39de09d6e4 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py @@ -0,0 +1,268 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import unittest + +import numpy as np +import scipy +from op_test import get_places + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + +os.environ["NVIDIA_TF32_OVERRIDE"] = "0" + +if sys.platform == "win32": + RTOL = {"float32": 1e-02, "float64": 1e-04} + ATOL = {"float32": 1e-02, "float64": 1e-04} +elif sys.platform == "darwin": + RTOL = {"float32": 1e-06, "float64": 1e-12} + ATOL = {"float32": 1e-06, "float64": 1e-12} +elif scipy.__version__ < "1.15": + RTOL = {"float32": 1e-06, "float64": 1e-15} + ATOL = {"float32": 1e-06, "float64": 1e-15} +else: + RTOL = {"float32": 1e-06, "float64": 1e-13} + ATOL = {"float32": 1e-06, "float64": 1e-13} + + +class MatrixExpTestCase(unittest.TestCase): + def setUp(self): + self.init_config() + self.generate_input() + self.generate_output() + self.places = get_places() + + def generate_input(self): + self._input_shape = (5, 5) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + def generate_output(self): + self._output_data = scipy.linalg.expm(self._input_data) + + def init_config(self): + self.dtype = "float64" + + def test_dygraph(self): + for place in self.places: + paddle.disable_static(place) + x = paddle.to_tensor(self._input_data, place=place) + out = paddle.linalg.matrix_exp(x).numpy() + + np.testing.assert_allclose( + out, + self._output_data, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + # TODO(megemini): cond/while_loop should be tested in pir + # + def test_static(self): + paddle.enable_static() + + for place in get_places(): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.static.data( + name="input", + shape=self._input_shape, + dtype=self._input_data.dtype, + ) + + out = paddle.linalg.matrix_exp(x) + exe = paddle.static.Executor(place) + + res = exe.run( + feed={"input": self._input_data}, + fetch_list=[out], + )[0] + + np.testing.assert_allclose( + res, + self._output_data, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + def test_grad(self): + for place in self.places: + x = paddle.to_tensor(self._input_data, place=place, stop_gradient=False) + out = paddle.linalg.matrix_exp(x) + out.backward() + x_grad = x.grad + + self.assertEqual(list(x_grad.shape), list(x.shape)) + self.assertEqual(x_grad.dtype, x.dtype) + + +class MatrixExpTestCaseFloat32(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCase3D(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 5, 5) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCase3DFloat32(MatrixExpTestCase3D): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCase4D(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 3, 5, 5) + np.random.seed(123) + self._input_data = 
np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCase4DFloat32(MatrixExpTestCase4D): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCaseEmpty(MatrixExpTestCase): + def generate_input(self): + self._input_shape = () + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCaseEmptyFloat32(MatrixExpTestCaseEmpty): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCaseScalar(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 3, 1, 1) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCaseScalarFloat32(MatrixExpTestCaseScalar): + def init_config(self): + self.dtype = "float32" + + +# test precision for float32 with l1_norm comparing `conds` +class MatrixExpTestCasePrecisionFloat32L1norm0(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.2], [-0.2, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat32L1norm1(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.8], [-0.8, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat32L1norm2(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 2.0], [-2.0, 0]]).astype(self.dtype) + + +# test precision for float64 with l1_norm comparing `conds` +class MatrixExpTestCasePrecisionFloat64L1norm0(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.01], [-0.01, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm1(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.1], [-0.1, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm2(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.5], [-0.5, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm3(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 1.5], [-1.5, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm4(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 2.5], [-2.5, 0]]).astype(self.dtype) + + +# test error cases +class MatrixExpTestCaseError(unittest.TestCase): + def test_error_dtype(self): + with self.assertRaises(ValueError): + x = np.array(123, dtype=int) + paddle.linalg.matrix_exp(x) + + def test_error_ndim(self): + # 1-d + with self.assertRaises(ValueError): + x = np.random.rand(1) + paddle.linalg.matrix_exp(x) + + # not square + with self.assertRaises(ValueError): + x = np.random.rand(3, 4) + paddle.linalg.matrix_exp(x) + + with self.assertRaises(ValueError): + x = np.random.rand(2, 3, 4) + paddle.linalg.matrix_exp(x) + + +if __name__ == "__main__": + 
unittest.main() From 07b41e0823c0dc588b3bc048d18c97059cae56e2 Mon Sep 17 00:00:00 2001 From: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Date: Thu, 16 Oct 2025 13:48:11 +0800 Subject: [PATCH 72/95] [metax] support wint4 in quantize (#103) --- .../weight_quantize_kernel_register.cu | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 4e2a4ce240c..44ac7f2fddc 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -115,11 +115,12 @@ void WeightQuantizeKernel(const Context& dev_ctx, dev_ctx.template Alloc(scale); weight_quant_gpu(dev_ctx, x.data(), - quanted_x.data(), + out->data(), scale->data(), weight_shape, arch, algo); + out->Resize({m, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); @@ -133,12 +134,12 @@ void WeightQuantizeKernel(const Context& dev_ctx, funcs::Transpose trans; trans(dev_ctx, x_int_tmp, out, axis); #else - weight_permute_gpu(dev_ctx, - quanted_x.data(), - out->data(), - weight_shape, - arch, - algo); + // weight_permute_gpu(dev_ctx, + // quanted_x.data(), + // out->data(), + // weight_shape, + // arch, + // algo); #endif } else if (algo == "w4a8") { weight_permute_gpu_w4a8(dev_ctx, From 581a9e2824fa38aeec47e3c158b51d4d988821c3 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:30:35 +0800 Subject: [PATCH 73/95] updata_metax (#104) * test * test --------- --- .github/workflows/metax_work.yaml | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index fd7d04c0843..360846846c2 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -5,12 +5,6 @@ on: pull_request: types: [opened, synchronize] branches: [develop, release/**] - paths: - - "**" - - "Paddle/**" - - "!backends/**" - - "backends/metax_gpu/**" - permissions: read-all defaults: @@ -34,18 +28,33 @@ jobs: --jobs=8 \ --branch ${{ github.base_ref || github.ref_name}} \ --recurse-submodules \ - https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . + https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/PaddlePaddle/PaddleCustomDevice.git . if [ "${{ github.event_name }}" == "pull_request" ]; then git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head - git submodule update --init --recursive + paddle_branch=${{ github.base_ref || github.ref_name}} + change_numbers=$(git diff --name-only remotes/origin/${paddle_branch} | wc -l) + change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l) + change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l) + git diff --name-only remotes/origin/${paddle_branch} + + if [ $change_numbers -ne $change_backend ]; then + echo "Common file changed, continue to run metax FULL CI test ..." + elif [ $paddle_branch -eq 0 ] ; then + echo "NO metax backend changes found, skip metax FULL CI ...." 
+ exit 0 + fi + + + # git submodule update --init --recursive fi - name: compile run: | + sleep 10000 cd backends/metax_gpu bash build.sh From 4ab7f5456a2bb339a667b1c117fe7fbf281c118e Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:56:32 +0800 Subject: [PATCH 74/95] updata_metax (#105) * chang_meatx_yaml * chang_meatx_yaml * updata_metax * test * test * test * test --------- --- .github/workflows/metax_work.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 360846846c2..bdedcaa7c8e 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -54,7 +54,7 @@ jobs: - name: compile run: | - sleep 10000 + # sleep 10000 cd backends/metax_gpu bash build.sh From ef5306d1032ff492091ebdff47bae64c526eafb6 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Thu, 16 Oct 2025 17:09:38 +0800 Subject: [PATCH 75/95] add one test to metax (#107) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * fix some tests * add one test --------- Co-authored-by: sw <1640472053@qq.com> Co-authored-by: duqimeng <77875733+duqimeng@users.noreply.github.com> Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> --- .../test_fused_conv2d_add_act_op_metax.py | 429 ++++++++++++++++++ 1 file changed, 429 insertions(+) create mode 100644 backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py b/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py new file mode 100644 index 00000000000..2b405a76367 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py @@ -0,0 +1,429 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
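+# The fused_conv2d_add_act op exercised here folds convolution, bias/residual
+# addition and activation into one kernel. The reference output assembled in
+# setUp() follows the same recipe with a naive convolution:
+#   out = conv2d_forward_naive(input, filter, ...)
+#   out = out + residual          # only when ResidualData is provided
+#   out = out + bias              # broadcast along the channel axis
+#   out = relu(out) or identity(out), per the "activation" attribute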
+ +import unittest + +import numpy as np +from op_test import OpTest, get_device_place, is_custom_device +from test_conv2d_op import conv2d_forward_naive + +from paddle.base import core + +core.set_cudnn_switch(False) + + +def create_test_padding_SAME_class(parent): + class TestPaddingSAMECase(parent): + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "SAME" + + cls_name = "{}_{}".format(parent.__name__, "PaddingSAMEOp") + TestPaddingSAMECase.__name__ = cls_name + globals()[cls_name] = TestPaddingSAMECase + + +def create_test_padding_VALID_class(parent): + class TestPaddingVALIDCase(parent): + def init_paddings(self): + self.pad = [1, 1] + self.padding_algorithm = "VALID" + + cls_name = "{}_{}".format(parent.__name__, "PaddingVALIDOp") + TestPaddingVALIDCase.__name__ = cls_name + globals()[cls_name] = TestPaddingVALIDCase + + +def create_test_cudnn_channel_last_class(parent): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", + ) + class TestCudnnChannelLastCase(parent): + def init_test_case(self): + super().init_test_case() + self.data_format = "NHWC" + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + K1, K2, R, S = self.filter_size + self.filter_size = [K1, R, S, K2] + + def test_check_output(self): + print(self.attrs) + if self.has_cuda(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-5, check_dygraph=False) + + cls_name = "{}_{}".format(parent.__name__, "CudnnChannelLast") + TestCudnnChannelLastCase.__name__ = cls_name + globals()[cls_name] = TestCudnnChannelLastCase + + +class TestFusedConv2dAddActOp(OpTest): + def setUp(self): + self.op_type = "fused_conv2d_add_act" + self.exhaustive_search = False + self.data_format = "NCHW" + self.dtype = np.float32 + self.activation = "relu" + self.add_residual_data = True + self.split_channels = None + self.outputs = None + self.padding_algorithm = "EXIPLICIT" + + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_residual() + self.init_activation() + self.init_paddings() + self.set_search_method() + + conv2d_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + bias = np.random.random(self.filter_size[0]).astype(self.dtype) + + if self.data_format == "NHWC": + filter_nchw = np.transpose(filter, [0, 3, 1, 2]) + else: + filter_nchw = filter + + self.output, _, _, _, _ = conv2d_forward_naive( + input, + filter_nchw, + self.groups, + conv2d_param, + self.padding_algorithm, + self.data_format, + ) + + self.output = self.output.astype(self.dtype) + + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + "Bias": OpTest.np_dtype_to_base_dtype(bias), + } + + if self.add_residual_data: + residual_data = np.random.random(self.output.shape).astype(self.dtype) + self.inputs["ResidualData"] = OpTest.np_dtype_to_base_dtype(residual_data) + self.output += residual_data + + # Add bias + if self.data_format == "NCHW": + self.output = self.output + bias.reshape((1, bias.size, 1, 1)) + else: + self.output = self.output + bias.reshape((1, 1, 1, bias.size)) + + assert self.activation in ["relu", "identity"] + if self.activation == "relu": + self.output = np.maximum(self.output, 0) + + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + 
"dilations": self.dilations, + "data_format": self.data_format, + "exhaustive_search": self.exhaustive_search, + "activation": self.activation, + "padding_algorithm": self.padding_algorithm, + } + if self.split_channels is not None: + self.attrs["split_channels"] = self.split_channels + + self.outputs = {"Output": self.output} + + self.set_outputs() + + def has_cuda(self): + return core.is_compiled_with_cuda() or is_custom_device() + + def test_check_output(self): + if self.has_cuda(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-5, check_dygraph=False) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_residual(self): + self.add_residual_data = True + + def init_activation(self): + self.activation = "relu" + + def set_search_method(self): + self.exhaustive_search = False + + def set_outputs(self): + pass + + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithoutResidual(TestFusedConv2dAddActOp): + def init_residual(self): + self.add_residual_data = False + + +class TestIdentityActivation(TestFusedConv2dAddActOp): + def init_activation(self): + self.activation = "identity" + + +class TestIdentityActivation1(TestFusedConv2dAddActOp): + def init_activation(self): + self.activation = "identity" + self.add_residual_data = False + + +class TestWithGroup(TestFusedConv2dAddActOp): + def init_group(self): + self.groups = 3 + + +class TestWithDilation(TestFusedConv2dAddActOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + +class TestCUDNNExhaustiveSearch(TestFusedConv2dAddActOp): + def set_search_method(self): + self.exhaustive_search = True + + +class TestMultipleOutputs(TestFusedConv2dAddActOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [1, 32, 17, 17] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [126, f_c, 3, 3] + self.split_channels = [84, 42] + + def set_outputs(self): + out1 = self.output[:, 0:84, :, :] + out2 = self.output[:, 84:126, :, :] + self.outputs["Outputs"] = [("out1", out1), ("out2", out2)] + + +class TestAsyPadding(TestFusedConv2dAddActOp): + def init_paddings(self): + self.pad = [0, 0, 1, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithPad_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_paddings(self): + self.pad = [2, 1, 3, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithStride_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] 
+ + def init_paddings(self): + self.pad = [2, 1, 3, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWith1x1_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [2, 2, 4, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithGroup_AsyPadding(TestFusedConv2dAddActOp): + def init_group(self): + self.groups = 3 + + +class TestWithDepthWise3x3_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [3, 4, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [8, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 4 + + def init_paddings(self): + self.pad = [1, 3, 2, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDepthWise5x5_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 4, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [8, f_c, 5, 5] + + def init_group(self): + self.groups = 4 + + def init_paddings(self): + self.pad = [0, 1, 1, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDepthWise7x7_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 8, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [16, f_c, 7, 7] + + def init_group(self): + self.groups = 8 + + def init_paddings(self): + self.pad = [1, 3, 4, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDilation_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 1, 3, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithInput1x1Filter1x1_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 3, 4, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestSimpleNHWC(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [3, 5, 5, 2] # NHWC + self.data_format = "NHWC" + assert np.mod(self.input_size[3], self.groups) == 0 + f_c = self.input_size[3] // self.groups + self.filter_size = [4, 3, 3, f_c] + + def init_group(self): + self.groups = 1 + + def init_paddings(self): + self.pad = [1, 1] + self.padding_algorithm = "EXPLICIT" + + +create_test_padding_SAME_class(TestAsyPadding) +create_test_padding_SAME_class(TestWithPad_AsyPadding) +create_test_padding_SAME_class(TestWithStride_AsyPadding) +create_test_padding_SAME_class(TestWithGroup_AsyPadding) 
+create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding) + +create_test_padding_VALID_class(TestAsyPadding) +create_test_padding_VALID_class(TestWithPad_AsyPadding) +create_test_padding_VALID_class(TestWithStride_AsyPadding) +create_test_padding_VALID_class(TestWithGroup_AsyPadding) +create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding) + +create_test_cudnn_channel_last_class(TestAsyPadding) +create_test_cudnn_channel_last_class(TestWithPad_AsyPadding) +create_test_cudnn_channel_last_class(TestWithStride_AsyPadding) +create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding) +create_test_cudnn_channel_last_class(TestWithInput1x1Filter1x1_AsyPadding) + +if __name__ == "__main__": + unittest.main() From 027c099c99074b172495f51d21db4504cd810d41 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Thu, 16 Oct 2025 18:55:57 +0800 Subject: [PATCH 76/95] uodata_metax (#106) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build 
* change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_warpctc.cmake * change warpctc.cmake * test * change_run_ut * remove_tets * test * add_generate_pb * [metax]fix paddle bug * change_ut * change_ut * change_ut * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file * [metax] add Rules * [metax] change_patch * update paddle * [metax] fix dot error * [metax]rm opt path and fix activation_kernel bug * updata paddle * chang_meatx_yaml * chang_meatx_yaml * updata_metax * test * test * test * test * test * test * test * test * test * test * test * test --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .github/workflows/metax_work.yaml | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index bdedcaa7c8e..353cbb098b6 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -28,22 +28,38 @@ jobs: --jobs=8 \ --branch ${{ github.base_ref || github.ref_name}} \ --recurse-submodules \ - https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/PaddlePaddle/PaddleCustomDevice.git . + https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . if [ "${{ github.event_name }}" == "pull_request" ]; then git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head git checkout pull/${{ github.event.pull_request.number }}/head + + + paddle_branch=${{ github.base_ref || github.ref_name}} + echo $paddle_branch + # sleep 10000 change_numbers=$(git diff --name-only remotes/origin/${paddle_branch} | wc -l) - change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l) - change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l) + echo $change_numbers + + + change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/" || true) + echo $change_backend + change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/metax_gpu" || true) + echo $change_metax_only + + # change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l) + # echo $change_backend + # change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l) + # echo $change_metax_only + git diff --name-only remotes/origin/${paddle_branch} if [ $change_numbers -ne $change_backend ]; then echo "Common file changed, continue to run metax FULL CI test ..." elif [ $paddle_branch -eq 0 ] ; then - echo "NO metax backend changes found, skip metax FULL CI ...." + echo "NO metax backend changes found, skip metax FULL CI ....." 
exit 0 fi @@ -59,6 +75,7 @@ jobs: bash build.sh - name: run test + run: | cd backends/metax_gpu/tests bash run_test.sh -j 16 From b08a8630a3b1fafbc768b3cb109e8ab9cceaabae Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 17 Oct 2025 10:23:08 +0800 Subject: [PATCH 77/95] updata eigen_and fix_bug (#109) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_warpctc.cmake * change warpctc.cmake * test * change_run_ut * remove_tets * test * add_generate_pb * [metax]fix paddle bug * change_ut * change_ut * change_ut * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file * [metax] add Rules * [metax] change_patch * update paddle * [metax] fix dot error * [metax]rm opt path and fix activation_kernel bug * updata paddle * chang_meatx_yaml * 
chang_meatx_yaml * updata_metax * test * test * test * test * test * test * test * test * test * test * test * test * updata_enigen --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- .../patch/mcEigen_3.4.0_paddle_final.zip | Bin 3747604 -> 3747549 bytes backends/metax_gpu/tests/ignore.txt | 7 +++++++ 2 files changed, 7 insertions(+) diff --git a/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip b/backends/metax_gpu/patch/mcEigen_3.4.0_paddle_final.zip index d4835abc3517e181bec2093f8cd2977c8b69cd0d..69d962f113256a866c015001b4c2453213e6c22c 100644 GIT binary patch delta 92073 zcmZTRbzD^0_q<_dn3?z91RVl1LkuATDj}#~H`s}aEf%NR)jy_5m zCjYMYMHvad$*oLH=)A&~>YsQTy9&_0Z8EppO)o?Js%-CAk+ZX#r6ML*E@;7qDDAdP zkJ=pVj5-ip?+$dfQ2yxDZ&Q0c$IeGSEdKQyR9#c$ZfwseoR-H60Mj z4-8WwbUU<}n(|GV_u`{gX=nFNM=S+83n<8MORMOqM6@Zy4rjF*tD{Q%rLyZ2d^BLsRb_s*_w`+yEo>*V=Z8iu@G0Yq2Ya^)+QL1f? zJN%)f+Uy_p(i*C^vzu)Cp^aL`5Vl38MyZi`pPn{*id!o`!|8*)om~TGGVqyb*;bh5 z#>sHm;nry1@$z^};y;noNi z(8+^{j^MiJJqMXzJ|5VCxC3{E*3zl$>?#-*Gz&%42&X9bZTe%*i12Ab;iUjT$Tkqu zhmfBbF5AYQie`d$pI@!g%ceFgcIo~M1P>O-xJw+Q_#Fe$yF`iGwqJUyRLU6J@~>ZK z(2P4{RM;jw=%EtjUdQ8W26=(3E`&Q#*}5(7DVR&CUX=cg5>7QbF{BM()H2M=;cO(a zkW*4A8SYUI+l=qJAfM!d)WQ+q@?z5E0dJj`_xVrChGccMg_0aKXZ#r=jh)QDRfi|f zyFvQq+RMoix*&toZM*Pg7>B|ee|v_|j(D!3g0@Km72IeBzEW}-95uhF-~8;`Ahsz- zUNu2B}VAktO3-GJN4SS)PJHDYv+nsApfC!|Boxp*~Yyt5s52Kz~Id#(uzvvlAT zOs=gfw~;A!r3d#`gT@sP;F?nnIyQ*6`~eyqSCKgfc?-!aA7v#up;5e)na*uuIz;@& zX_??^L%C#RM{-7REm7R#5nNT)AC$>mwx^s!roZhq8N#tiL$V%Mun_YK&Yd(3i*k?} zkLT9@$DfgFfey9*Fi~T9uTK29|z;= zx|R%dRbkqb#08uS$*L43NR3u-GZ@qBtGKTWys?&ZcEHM1zh!tRDq1LUYErd;)1lyo z1>9sbYvrk{hwFh^1JtC;247~Dj4gR@=Texb$4Hz?sh5jgzkrdWfppR5ERq%K z>1h01|0i@u54@Q-0|aJa>PW!hfP(TKDI6N^0RFSHCl}_!D1;tYa<&%uc%?#NLira~ zS5JQk72Vev6OZC_svYGd>6D5x=C*HIg&O@UX+*^iX2cV*>{%)N z$4soTlim40ipuIZlRfpKTqK>VVu6B&eEQbvQM19ADP+!uiZtZTq-p&qqx9Ncv4{zq z?WJg-Wuf&~*rVGO`o-_M0HI3tGdZ1!x?z+_suZN?i~g80D0e_#!0Gxc&O#aIb%ZQr_*6|1#WM51@t0Ls z@oW8;eM7QpC~l&d)S8NuC?>3y!Ue6HRP;Eu4n*lU<4@}h60qsP0`#PjVlXQZNEdFG}j>ak`_^JWK>JYljKecE%QrC zs^UJTOplfYn^FKhLm@9SB1mbAC=JPL6=jg-cUAOc!|Pi&#a;&P@1bbKa#$Xnq;4OD z5jAMvM{yoC4C||4qfs&qP%QZmy&+--T)zXw4g&7y1(uO69PDOx<#=9p@gl9WeAmQRXjuJ{5HiK3P~rnD?Tbjd8<#OY;>JC)uh#sNlKJJQS->IX%+-7U(6je6FG0CdvNAyJM+*BuDW29_`3-|2&FHJ-7^j?mN}ho$`YY`@ zO78x-@9`a=+XM2-33Q{5DCw_sATI|fkJ9zNUzLAdwt%nl1hS<8lvzM=kRGQi&*NrA z+B-<;s-T$X8((uXLEa5@leafE;+YO(%z3EN3$yP=`D?FAK+~%ZfF} z`I=FFU8H=ATek%Rvuj?5xEzY2dq9eP@HY?{yIlE?4#$moopL5KrC_5nQB568vK}cT zN$OA-`eECYYyiY>Q%=X+kWTJYLhX_>ba9HRNg>Q|4+L^Z<4nhtOy?>E?@^w`(IkC3 zpd4$&z>PVtT*F#}{#NFoG-=;;BV`hk?D0&QpvBgdH_9d~ z%=zz>uvwAykp_NH{*JM|UpB02n(T10#b&XR0zsvhntoD_U^HjHD1Yav4LX-khHVhy z;qG$DgT1n&%2PU^Q5``OZ~j{}Y6S3!66HLNCh3kUUMdz;*_iyNigsiUfbKByKWdVf z8KsuC8C1iWuy99JWoCCTSCxT9YNWTSwZc>p)P@xN`C}%p=BoUDSqI2=Jzm%JtS|W4 zM~`V(;16?Q<#FKXwq{kZlG;&Go%UK*^st(&46<1yz=0(R^-4pV>XHgabFr^a4;Xb* z$lVI6aB^Tgq(hG|RXTIu)e5T344e_68lzCk+K~gHmQa$FsM3_xrY z{rR(E&qheIdllu(4w0rMs`fLahK*EL6{3ukGWgFciF!Lbj&!-E3YFR?sha7L(dHD@ z7!JXRPO7nL$}_#$v|}G3(<9x9B_hy*!o#OTsif!KRcZ~zu@3m~Ck31r35ZD^)j`;Pj0uEfYL?hw2DVxg7HKh&e0R*#*TT zmtlugw*@3^cTV-0k+!|2nuRVQ&Gvg!f5WNFW7V(w99cUl{IN>S99HFtstjW!g*{iD zRw2_buT)#m5^2jj)f^>NLh<*Dk34wvU^ubdQwI{`Jg|ApXO-CjiB>Aq7nxGS4eBwh 
[GIT binary patch payload: base85-encoded delta hunks (delta 91922), not human-readable]
z9@(aOC8_gE?1SQmVR+Ye#s@G#$q#mN1sJ0XR}2m$?TV#aqCS1*4IKhkWg~k`$HG7~ zPp`uw2WbVT&Htcc_7? zNlB$4Np&qd(lFP?{SDWy_?xqy{Wop}2G6e&UAwAb@7Z-|WI1RZqDmh3Rh67bm-~?} zWO}010$6aiG!E}rk9vwg;Kh40|3(dmwO0*(LdNosdbh7a#VJi1U_up+Hada?w=;bK>wD1|QLMrb*@v26c?<0sK_zYfSh$j(uu*;a|>4ZD*L;QbLj6E^lp*`2ksgAV( zdM_c?4-7qsvLvcC$*7gB6r#CVhi+GLpzYomH-4v5d^6pFJ$x3K; z(biReV7dG;U(UBJIS(v8;GgpVOiNF3<<^~#a%j$6pLzhlIIjYA7R+UpzJmfQt4L-8 zcG%Y$HUPnZEzS(N&SxxDrpI%`es%Ze*+}re@lgWUG7PIWjhrn!mz$@!A z!)PWq_oDwlz;u1eF;o7LniZSwq2qrq>3~{lgLAUD`8wH>5@#ijxHjV6Q-~r;h~06)y2lT+n-+uDi~$33#x&u2b@0DU`-DSf25$}#nP zm&d?N?8q?-J4V}+agU`^<9WG~0@rupKx3C^%sg9sp%=8JX8kqve8VWXV=Auj#Os&} zJttn&1UCwFG-QM8Puu*2C@(-y7oS=E++C$}BE{X|r-Ua|MNP`HK$Nd{+w?C;J`2fW zAhyPH4l5F)TX~0%&dAWdE7NMiO?7+a!e+9@C{bXVKn$a+|o40&=^%%qGmMv5)%=u-<6ug#6=bUF}M%$6c>CsNS zu*{s%aAb9hGPv^K0yzvLGgS_*#Aq#bcw5HwyND=BcqQVz^s z8r}RzWtQhVa?DK!Cq)9E5wRwf-QjlYdA1r zO|&C9VR6>pBJ+B`15J!*D&k>vMjj^~)>4sPXHo(4SN2OAiX+_^3_vlxV>WU+w=D=t zeM`mC6bQG2cD0*TpofLF$jmTOpzC%HjMxq{!tJFPn98>-6gw;8?=@f(U{B_w-`2yv8~B7JzC8P( zj!H@(>su?P+d$UcSsLBJ|F+rI#4)e9DB`J%Wx|8{efkfO1;a7E#07VV4H@N*a=R$g z@hJMcg14WYim@kIu2M-d0fE`Jq3KgBJpkXY-KL1L7cUuNpS`kF6g9Z&o_= zs6{&!OTH;k<>2xxvCK{o>)Sxog%Ao%CavAk7KDVL$!A~Key1-^pF!}D_~Kh3Bi$8C ztIQ>LaNX6Fa_vK+ZE3n{B{L&uc>rVAmtuO5Qyz*=Xan5GuAad3A3zE{l_WCWQ)*hw zz4b*i{-{EW>D^J*u9_5h%Gxo>iC*B^{cY}?E7U-K7wQBkI*|2qqh__zP(wu1@PQx4tz9P-`c$NS@T%yxvJY^gxA?$ zZewl6a^l*yavNhUBdQ$)Oq!@-@EUuf-co~MdLO6gvdJ7B(q1v$_o!Q$(Kxw5flrFz z6LGQf$>MZzS$b#E?*+M8G69Y4Az*tp2cFK>TR3vF_OrL$rzzXfe(-bV(zaMhHv1^1 z-_Xh02(z@aFIYFu!kJ&PlyErnD`Y#dpfs*d$uMt?<%D+>XDw!ueeCr)^qm z5wqGa0J6qk)al#tAi7f7*q^M;`e|-&` zEPffQ;v;ZFNB@X7XQ>V(suLs)2*UY6+guo3u2tj#f|m;DsiJ7r<>4G9^_1k z-#UVKUtf+H<{RTlE<{SNjrlb15dz@vAsl!lB*vTkFGSLoC3R`F%XIoFy%$nI%rW0V zl(;MDHYmoG1f)on%Np$bOJSD=!I$3n94H<|kCQnsD5l@DrZ$RcPyR{Q!{}I0iI={! 
zMsjieUX*7&HT%(WruTNiL*b|f@tt^&HJmPWO^mMj8*y5nR^x(>CgJg{v7gN2jXcg{ zOdj(0QJl1lvOBqJo>#_yPs4+tiU)Ii4GdFWGgmoraN*A0aPSixGsHBgKE%Zpgei_> z_)AiRn7vO&ts7v3xC*SJVr0L>3r*^$++#7)C7344jEBB#?};=#lzRDXc_eQx<2*jO z8RJ0yEQ_)HvrBAVETuoBxT6aKlzvks@*?fSQ8u%ed9HiD2=kscFdV?TIu2YOF6OK| PITEh8*o|+BL38>)u$+lc diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index 4e54e17b3ef..be0357e5319 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -23,3 +23,10 @@ test_conv3d_transpose_op test_conv3d_layer test_conv3d_transpose_part2_op test_fused_conv2d_add_act_op +test_swiglu_metax +test_set_value_op +test_pad_op +test_squared_l2_norm_op +test_concat_op +test_dygraph_spectral_norm +test_bincount_op From 53f4bdeb04b6a2d47a2da4d04718302eb3f6a58b Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Fri, 17 Oct 2025 13:35:00 +0800 Subject: [PATCH 78/95] updata paddle (#110) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. 
* [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * [metax]change_cupti_and_fix_softmax * [metax]change_patch * [metax]change_patch * [metax] updata_qr_kernel * [metax] updata_qr_kernel * [Metax] fix cufft and fix some blas kernel apply * [metax] fix bug * [Metax] add github action * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]chaneg build * [metax]fix_code style and index_elementwise_put_kernel * [metax]change_build * [metax]change_build * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_metax_work * change_warpctc.cmake * change warpctc.cmake * test * change_run_ut * remove_tets * test * add_generate_pb * [metax]fix paddle bug * change_ut * change_ut * change_ut * [metax]fix patch and fix missing kernel * [metax] link mccl and fix missing kernel * [metax] rename yaml file * [metax] rm file * [metax] rm file * [metax] add Rules * [metax] change_patch * update paddle * [metax] fix dot error * [metax]rm opt path and fix activation_kernel bug * updata paddle * chang_meatx_yaml * chang_meatx_yaml * updata_metax * test * test * test * test * test * test * test * test * test * test * test * test * updata_enigen * updata_paddle --------- Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> Co-authored-by: ZhouDuan <1184319564@qq.com> --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 89f4bd92f49..fd95abaec01 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 89f4bd92f49e15a9e1803a9e582526b2b8e4557d +Subproject commit fd95abaec0133b2e2f0ab83684925cd62a18150d From bf3074e5fdd7962b08aa6673baf42dcb6ca90025 Mon Sep 17 00:00:00 2001 From: duqimeng <1640472053@qq.com> Date: Fri, 17 Oct 2025 18:16:07 +0800 Subject: [PATCH 79/95] test --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index fd95abaec01..5dbecdcb0e4 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 
fd95abaec0133b2e2f0ab83684925cd62a18150d +Subproject commit 5dbecdcb0e4ddd3488927f49082dfb66c794f9e7 From 8a54b1d850770680759095280a7c500abcc10c05 Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Mon, 20 Oct 2025 15:07:32 +0800 Subject: [PATCH 80/95] [metax] modify kernels (#117) * modify kernels * modify kernels --- backends/metax_gpu/CMakeLists.txt | 24 +- .../cuda_kernels/argsort_kernel_register.cu | 2 +- .../cuda_kernels/batch_fc_kernel_register.cu | 2 +- .../matmul_grad_kernel_register.cu | 2 +- .../cuda_kernels/matmul_kernel_register.cu | 2 +- .../cuda_kernels/multihead_matmul_kernel.cu | 2 +- .../kernels/dynload/cupti_lib_path.h | 19 - .../kernels/dynload/dynamic_loader.cc | 938 ----- .../kernels/dynload/dynamic_loader.h | 61 - .../kernels/funcs/affine_grid_utils.h | 2 +- backends/metax_gpu/kernels/funcs/blas/blas.cc | 59 - backends/metax_gpu/kernels/funcs/blas/blas.h | 631 ---- .../kernels/funcs/blas/blas_impl.cu.h | 3027 ----------------- .../metax_gpu/kernels/funcs/blas/blas_impl.h | 2003 ----------- .../kernels/funcs/blas/blaslt_gemm_search.h | 794 ----- .../kernels/funcs/blas/blaslt_impl.cu.h | 1137 ------- .../metax_gpu/kernels/funcs/blas/cublas.cc | 40 - .../metax_gpu/kernels/funcs/blas/cublas.h | 148 - .../metax_gpu/kernels/funcs/blas/cublasLt.cc | 27 - .../metax_gpu/kernels/funcs/blas/cublasLt.h | 115 - .../metax_gpu/kernels/funcs/blas/cublaslt.h | 328 -- backends/metax_gpu/kernels/funcs/blas/port.cc | 163 - backends/metax_gpu/kernels/funcs/blas/port.h | 61 - .../metax_gpu/kernels/funcs/layer_norm_util.h | 2 +- .../metax_gpu/kernels/funcs/quant_dequant.h | 430 --- backends/metax_gpu/kernels/gpudnn/cudnn.cc | 78 - backends/metax_gpu/kernels/gpudnn/cudnn.h | 218 -- .../kernels/impl/addmm_kernel_impl.h | 2 +- .../kernels/impl/baddbmm_kernel_impl.h | 2 +- .../kernels/impl/bilinear_grad_kernel_impl.h | 2 +- .../kernels/impl/bilinear_kernel_impl.h | 2 +- .../kernels/impl/bmm_grad_kernel_impl.h | 4 +- .../metax_gpu/kernels/impl/bmm_kernel_impl.h | 2 +- .../kernels/impl/cholesky_grad_kernel_impl.h | 2 +- .../impl/cholesky_solve_grad_kernel_impl.h | 2 +- .../kernels/impl/conv_grad_kernel_impl.h | 2 +- .../metax_gpu/kernels/impl/conv_kernel_impl.h | 2 +- .../kernels/impl/conv_transpose_kernel_impl.h | 2 +- .../impl/deformable_conv_grad_kernel_impl.h | 2 +- backends/metax_gpu/kernels/impl/elementwise.h | 2 +- .../kernels/impl/flatten2_kernel_impl.h | 2 +- .../kernels/impl/gru_unit_kernel_impl.h | 2 +- .../kernels/impl/index_select_impl.h | 2 +- .../kernels/impl/inverse_grad_kernel_impl.h | 2 +- .../metax_gpu/kernels/impl/lstm_kernel_impl.h | 2 +- .../kernels/impl/lu_grad_kernel_impl.h | 2 +- .../kernels/impl/lu_solve_grad_kernel_impl.h | 4 +- .../kernels/impl/matmul_grad_kernel_impl.h | 2042 ----------- .../kernels/impl/matmul_kernel_impl.h | 1717 ---------- .../kernels/impl/matmul_kernel_impl_maca.h | 1696 --------- .../kernels/impl/multi_dot_kernel_impl.h | 2 +- .../metax_gpu/kernels/impl/mv_kernel_impl.h | 2 +- .../kernels/impl/solve_grad_kernel_impl.h | 2 +- .../impl/triangular_solve_grad_kernel_impl.h | 2 +- .../batch_fc_grad_kernel_register.cu | 2 +- .../kernels/metax_kernel/block_attn.h | 2 +- .../kernels/metax_kernel/elementwise.h | 2 +- .../kernels/metax_kernel/metax_context.h | 4 +- .../metax_kernel/mv_grad_kernel_register.cu | 2 +- .../kernels/metax_kernel/quant_dequant.h | 2 +- .../rank_attention_grad_kernel_register.cu | 4 +- .../rank_attention_kernel_register.cu | 4 +- .../slogdeterminant_kernel_register.cu | 2 +- 
.../triangular_solve_kernel_register.cu | 2 +- backends/metax_gpu/patch/paddle.patch | 487 +-- backends/metax_gpu/runtime/runtime.cc | 2 +- 66 files changed, 210 insertions(+), 16127 deletions(-) delete mode 100644 backends/metax_gpu/kernels/dynload/cupti_lib_path.h delete mode 100644 backends/metax_gpu/kernels/dynload/dynamic_loader.cc delete mode 100644 backends/metax_gpu/kernels/dynload/dynamic_loader.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blas.cc delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blas.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blas_impl.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/blaslt_gemm_search.h delete mode 100755 backends/metax_gpu/kernels/funcs/blas/blaslt_impl.cu.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/cublas.cc delete mode 100755 backends/metax_gpu/kernels/funcs/blas/cublas.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/cublasLt.cc delete mode 100644 backends/metax_gpu/kernels/funcs/blas/cublasLt.h delete mode 100755 backends/metax_gpu/kernels/funcs/blas/cublaslt.h delete mode 100644 backends/metax_gpu/kernels/funcs/blas/port.cc delete mode 100644 backends/metax_gpu/kernels/funcs/blas/port.h delete mode 100644 backends/metax_gpu/kernels/funcs/quant_dequant.h delete mode 100644 backends/metax_gpu/kernels/gpudnn/cudnn.cc delete mode 100644 backends/metax_gpu/kernels/gpudnn/cudnn.h delete mode 100644 backends/metax_gpu/kernels/impl/matmul_grad_kernel_impl.h delete mode 100755 backends/metax_gpu/kernels/impl/matmul_kernel_impl.h delete mode 100644 backends/metax_gpu/kernels/impl/matmul_kernel_impl_maca.h diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 6aecdc1f833..9e257e9507d 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -109,6 +109,10 @@ file( CUDA_SRCS # backends ${PADDLE_SOURCE_DIR}/paddle/phi/backends/gpu/cuda/cuda_info.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/dynamic_loader.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublas.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cublasLt.cc + ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cudnn.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cuda_driver.cc ${PADDLE_SOURCE_DIR}/paddle/phi/backends/gpu/cuda/cuda_graph.cc # Core @@ -698,7 +702,6 @@ file( kernels/gpudnn/*.cu kernels/cuda_kernels/*.cc kernels/cuda_kernels/*.cu - kernels/funcs/blas/*.cc kernels/ernie_core/*.cu) set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) @@ -746,11 +749,28 @@ target_compile_definitions( PUBLIC PADDLE_WITH_CUDA=1 PADDLE_WITH_CUSTOM_DEVICE=1 mcblasContext=cublasContext + cublasLtContext=mcblasLtContext GPUContext=CustomContext KPSContext=CustomContext STREAM_TYPE=cudaStream_t EVENT_TYPE=cudaEvent_t - EIGEN_USE_GPU=1) + EIGEN_USE_GPU=1 + CUDA_LIB_NAME="libmcruntime.so" + BLAS_LIB_NAME="libmcblas.so" + BLASLT_LIB_NAME="libmcblasLt.so" + DNN_LIB_NAME="libmcdnn.so" + PTI_LIB_NAME="libmcpti.so" + RAND_LIB_NAME="libcurand.so" + JPEG_LIB_NAME="libnvjpeg.so" + SOLVER_LIB_NAME="libmcsolver.so" + SPARSE_LIB_NAME="libmcsparse.so" + RTC_LIB_NAME="libmcruntime.so" + FLASHATTN_LIB_NAME="libmcFlashAttn.so" + FLASHATTNV3_LIB_NAME="libflashattnv3.so" + CCL_LIB_NAME="libmccl.so" + FFT_LIB_NAME="libcufft.so" + SPARSELT_LIB_NAME="libcusparseLt.so" + CUPTI_LIB_PATH="/root/cu-bridge/CUDA_DIR/extras/CUPTI/lib64") # packing wheel package 
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in diff --git a/backends/metax_gpu/kernels/cuda_kernels/argsort_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/argsort_kernel_register.cu index 8fb331eeedd..20ea33834e6 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/argsort_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/argsort_kernel_register.cu @@ -26,11 +26,11 @@ namespace cub = hipcub; #endif -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" #include "paddle/phi/kernels/transpose_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/batch_fc_kernel_register.cu index caccb01f71d..0e82304d31d 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/batch_fc_kernel_register.cu @@ -14,10 +14,10 @@ #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { diff --git a/backends/metax_gpu/kernels/cuda_kernels/matmul_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/matmul_grad_kernel_register.cu index f9eef9908ab..bb3b07d24d0 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/matmul_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/matmul_grad_kernel_register.cu @@ -13,9 +13,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "../impl/matmul_grad_kernel_impl.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" #include "paddle/phi/kernels/matmul_grad_kernel.h" PD_CUSTOM_KERNEL_REGISTER(matmul_grad, diff --git a/backends/metax_gpu/kernels/cuda_kernels/matmul_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/matmul_kernel_register.cu index 57c3a85b1ea..750cf2a9f36 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/matmul_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/matmul_kernel_register.cu @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" -#include "kernels/impl/matmul_kernel_impl.h" +#include "paddle/phi/kernels/impl/matmul_kernel_impl.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu index 151c929e41c..998854140fc 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/multihead_matmul_kernel.cu @@ -15,11 +15,11 @@ #include #include -#include "kernels/funcs/blas/blas.h" #include "paddle/common/errors.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" namespace phi { diff --git a/backends/metax_gpu/kernels/dynload/cupti_lib_path.h b/backends/metax_gpu/kernels/dynload/cupti_lib_path.h deleted file mode 100644 index 6082fffd60e..00000000000 --- a/backends/metax_gpu/kernels/dynload/cupti_lib_path.h +++ /dev/null @@ -1,19 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define CUPTI_LIB_PATH "/root/cu-bridge/CUDA_DIR/extras/CUPTI/lib64" diff --git a/backends/metax_gpu/kernels/dynload/dynamic_loader.cc b/backends/metax_gpu/kernels/dynload/dynamic_loader.cc deleted file mode 100644 index a23b7fa2aff..00000000000 --- a/backends/metax_gpu/kernels/dynload/dynamic_loader.cc +++ /dev/null @@ -1,938 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -// #include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "kernels/dynload/dynamic_loader.h" - -#include - -#include -#include -#include -#include -// #include "paddle/phi/backends/dynload/cupti_lib_path.h" -#include "./dynload/cupti_lib_path.h" -#include "paddle/phi/common/port.h" -#include "paddle/phi/core/enforce.h" - -#if defined(_WIN32) -#include -#endif - -// TODO(wilber): The phi computing library requires a component to manage flags -// (maybe not use gflags). 
-#include "glog/logging.h" -#include "paddle/common/flags.h" - -COMMON_DECLARE_string(cudnn_dir); -COMMON_DECLARE_string(cuda_dir); -COMMON_DECLARE_string(cublas_dir); -COMMON_DECLARE_string(nccl_dir); -COMMON_DECLARE_string(cupti_dir); -COMMON_DECLARE_string(tensorrt_dir); -COMMON_DECLARE_string(mklml_dir); -COMMON_DECLARE_string(lapack_dir); -COMMON_DECLARE_string(mkl_dir); -COMMON_DECLARE_string(op_dir); -COMMON_DECLARE_string(cusparselt_dir); -COMMON_DECLARE_string(curand_dir); -COMMON_DECLARE_string(cusolver_dir); -COMMON_DECLARE_string(cusparse_dir); -COMMON_DECLARE_string(win_cuda_bin_dir); -#ifdef PADDLE_WITH_HIP - -PHI_DEFINE_string(miopen_dir, - "", - "Specify path for loading libMIOpen.so. For instance, " - "/opt/rocm/miopen/lib. If empty [default], dlopen " - "will search miopen from LD_LIBRARY_PATH"); - -PHI_DEFINE_string(rocm_dir, - "", - "Specify path for loading rocm library, such as librocblas, " - "libmiopen, libhipsparse. For instance, /opt/rocm/lib. " - "If default, dlopen will search rocm from LD_LIBRARY_PATH"); - -PHI_DEFINE_string(rccl_dir, - "", - "Specify path for loading rccl library, such as librccl.so. " - "For instance, /opt/rocm/rccl/lib. If default, " - "dlopen will search rccl from LD_LIBRARY_PATH"); -#endif - -// #ifdef PADDLE_WITH_FLAGCX -// COMMON_DECLARE_string(flagcx_dir); -// #endif - -// PHI_DEFINE_EXPORTED_string( -// flagcx_dir, // NOLINT -// "", -// "Specify path for loading libflagcx.so. For instance, " -// "For instance, /usr/local/flagcx/lib. If default, " -// "dlopen will search flagcx from LD_LIBRARY_PATH"); - -#ifdef PADDLE_WITH_XPU -PD_DEFINE_string(xpti_dir, "", "Specify path for loading libxpti.so."); -#endif - -namespace phi::dynload { - -struct PathNode { - PathNode() = default; - std::string path = ""; -}; - -static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; // NOLINT - -// NOTE: In order to adapt to the default installation path of cuda -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin"; -#else -static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64"; // NOLINT -#endif - -static PathNode s_py_site_pkg_path; - -#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll"; -static constexpr char* win_cublas_lib = - "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll"; -#if CUDA_VERSION >= 11000 -static constexpr char* win_curand_lib = - "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; -static constexpr char* win_nvjpeg_lib = - "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; -static constexpr char* win_cusolver_lib = - "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusolver64_" CUDA_VERSION_MAJOR - ".dll;cusolver64_11.dll;cusolver64_10.dll"; -static constexpr char* win_cusparse_lib = - "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll;cusparse64_10.dll"; -static constexpr char* win_cufft_lib = - "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll;cufft64_11.dll;cufft64_10.dll"; -#else -static constexpr char* win_curand_lib = - "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;curand64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_nvjpeg_lib = - "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - 
".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cusolver_lib = - "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cusparse_lib = - "cusparse64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cusparse64_" CUDA_VERSION_MAJOR ".dll"; -static constexpr char* win_cufft_lib = - "cufft64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR - ".dll;cufft64_" CUDA_VERSION_MAJOR ".dll"; -#endif // CUDA_VERSION -#endif - -static inline std::string join(const std::string& part1, - const std::string& part2) { -// directory separator -#if defined(_WIN32) - const char sep = '\\'; -#else - const char sep = '/'; -#endif - if (!part2.empty() && part2.front() == sep) { - return part2; - } - std::string ret; - ret.reserve(part1.size() + part2.size() + 1); - ret = part1; - if (!ret.empty() && ret.back() != sep) { - ret += sep; - } - ret += part2; - return ret; -} - -static inline std::vector split( - const std::string& str, const std::string separator = " ") { - std::vector str_list; - std::string::size_type firstPos = 0; - firstPos = str.find_first_not_of(separator, 0); - std::string::size_type lastPos = 0; - lastPos = str.find_first_of(separator, firstPos); - while (std::string::npos != firstPos && std::string::npos != lastPos) { - str_list.push_back(str.substr(firstPos, lastPos - firstPos)); - firstPos = str.find_first_not_of(separator, lastPos); - lastPos = str.find_first_of(separator, firstPos); - } - if (std::string::npos == lastPos) { - str_list.push_back(str.substr(firstPos, lastPos - firstPos)); - } - return str_list; -} - -void SetPaddleLibPath(const std::string& py_site_pkg_path) { - s_py_site_pkg_path.path = py_site_pkg_path; - VLOG(3) << "Set paddle lib path : " << py_site_pkg_path; -} - -static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path, - const std::string& dso_name, - int dynload_flags) { - void* dso_handle = nullptr; - if (!spec_path.empty()) { - // search xxx.so from custom path - VLOG(3) << "Try to find library: " << dso_name - << " from specific path: " << spec_path; - std::string dso_path = join(spec_path, dso_name); - dso_handle = dlopen(dso_path.c_str(), dynload_flags); - } - return dso_handle; -} - -static inline std::string FindLibAbsolutePath(const std::string& directory, - const std::string& filename) { - DIR* dir = opendir(directory.c_str()); - struct dirent* ent; - - if (dir != nullptr) { - while ((ent = readdir(dir)) != nullptr) { - if (ent->d_type == DT_REG || ent->d_type == DT_LNK) { - if (filename == std::string(ent->d_name)) { - closedir(dir); - return join(directory, ent->d_name); - } - } else if (ent->d_type == DT_DIR) { - if (strcmp(ent->d_name, ".") != 0 && strcmp(ent->d_name, "..") != 0) { - std::string res = - FindLibAbsolutePath(join(directory, ent->d_name) + "/", filename); - if (!res.empty()) { - closedir(dir); - return res; - } - } - } - } - closedir(dir); - } - return ""; -} - -static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, - int dynload_flags) { - // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH - // and /usr/local/lib path - void* dso_handle = dlopen(dso_path.c_str(), dynload_flags); - VLOG(3) << "Try to find library: " << dso_path - << " from default system path."; - -// TODO(chenweihang): This path is used to search which libs? -// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to -// bring System Integrity Projection (SIP), if dso_handle -// is null, search from default package path in Mac OS. 
-#if defined(__APPLE__) || defined(__OSX__) -#if defined(__arm__) || defined(__aarch64__) - if (nullptr == dso_handle) { - dso_handle = - dlopen(FindLibAbsolutePath("/opt/homebrew/Cellar/", dso_path).c_str(), - dynload_flags); - } -#else - if (nullptr == dso_handle) { - dso_handle = - dlopen(FindLibAbsolutePath("/usr/local/cuda/lib/", dso_path).c_str(), - dynload_flags); - } -#endif -#endif - - return dso_handle; -} - -/* - * We define three priorities for dynamic library search: - * - * First: Search for path specified by the user - * Second: Search the stheystem default path - * Third: Search for a special path corresponding to - * a specific library to adapt to changes and easy to expand. - */ - -static inline void* GetDsoHandleFromSearchPath( - const std::string& config_path, - const std::string& dso_name, - bool throw_on_error = true, - const std::vector& extra_paths = std::vector(), - const std::string& warning_msg = std::string()) { -#if !defined(_WIN32) - int dynload_flags = RTLD_LAZY | RTLD_LOCAL; -#else - int dynload_flags = 0; -#endif // !_WIN32 -#if defined(_WIN32) - std::vector cuda_bin_search_path = { - L"cublas", - L"cuda_nvrtc", - L"cuda_runtime", - L"cudnn", - L"cufft", - L"curand", - L"cusolver", - L"cusparse", - L"nvjitlink", - }; - for (auto search_path : cuda_bin_search_path) { - std::wstring_convert> converter; - std::wstring win_path_wstring = - converter.from_bytes(FLAGS_win_cuda_bin_dir); - search_path = win_path_wstring + L"\\" + search_path + L"\\bin"; - AddDllDirectory(search_path.c_str()); - } -#endif - std::vector dso_names = split(dso_name, ";"); - void* dso_handle = nullptr; - for (auto const& dso : dso_names) { - // 1. search in user config path by FLAGS - dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags); - // 2. search in system default path - if (nullptr == dso_handle) { - dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags); - } - // 3. search in extra paths - if (nullptr == dso_handle) { - for (auto const& path : extra_paths) { - VLOG(3) << "extra_paths: " << path; - dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags); - } - } - if (nullptr != dso_handle) break; - } - - // 4. [If Failed for All dso_names] logging warning if exists - if (nullptr == dso_handle && !warning_msg.empty()) { - LOG(WARNING) << warning_msg; - } - - // 5. [If Failed for All dso_names] logging or throw error info - if (nullptr == dso_handle) { - auto error_msg = - "The third-party dynamic library (%s) that Paddle depends on is not " - "configured correctly. (error code is %s)\n" - " Suggestions:\n" - " 1. Check if the third-party dynamic library (e.g. CUDA, CUDNN) " - "is installed correctly and its version is matched with paddlepaddle " - "you installed.\n" - " 2. 
Configure third-party dynamic library environment variables as " - "follows:\n" - " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" - " - Windows: set PATH by `set PATH=XXX;%%PATH%%`\n" - " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " - "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " - "impossible unless System Integrity Protection (SIP) is disabled.]"; -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - if (throw_on_error) { - // NOTE: Special error report case, no need to change its format - PADDLE_THROW( - common::errors::PreconditionNotMet(error_msg, dso_name, errorno)); - } else { - LOG(WARNING) << paddle::string::Sprintf(error_msg, dso_name, errorno); - } - } - - return dso_handle; -} - -void* GetCublasDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_11.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublas64_12.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } - -#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.11"); -#else - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=11000-12000 start" ; - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblas.so"); - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=11000-12000 end" ; -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublas.so.12"); -#else - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=12000-13000 start" ; - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblas.so"); - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=12000-13000 end" ; -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocblas.so"); -#else - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=else start" ; - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblas.so"); - // VLOG(0) << "dynload:libmcblas.so: CUDA_VERSION=else end" ; -// return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblas.so"); -#endif -} - -void* GetCublasLtDsoHandle() { -// APIs available after CUDA 10.1 -#if defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.11"); -#else - // return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); - return 
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblasLt.so"); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so.12"); -#else - // return GetDsoHandleFromSearchPath(FLAGS_cublas_dir, "libcublasLt.so"); - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcblasLt.so"); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_11.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cublasLt64_12.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cublas_lib, true, {cuda_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 12, paddle " - "temporarily no longer supports"); - return nullptr; - } -#elif !defined(__linux__) && defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10010 - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so"); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhipblaslt.so"); -#else - std::string warning_msg( - "Your CUDA_VERSION less 10.1, not support CublasLt. " - "If you want to use CublasLt, please upgrade CUDA and rebuild " - "PaddlePaddle."); - return nullptr; -#endif -} - -void* GetCUDNNDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - std::string mac_warn_meg( - "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " - "For instance, sudo tar -xzf " - "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " - "chmod a+r /usr/local/cuda/include/cudnn.h " - "/usr/local/cuda/lib/libcudnn*"); - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_meg); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - std::string win_warn_meg( - "Note: [Recommend] copy cudnn into CUDA installation directory. 
\n " - "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from " - "NVIDIA's official website, \n" - "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing " - "Toolkit\\CUDA\\v10.0\n" - "You should do this according to your CUDA installation directory and " - "CUDNN version."); - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12030) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, "cudnn64_8.dll", true, {cuda_lib_path}, win_warn_meg); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); -#endif - } else if (CUDA_VERSION >= 12030) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, "cudnn64_9.dll", true, {cuda_lib_path}, win_warn_meg); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cudnn_lib, true, {cuda_lib_path}, win_warn_meg); -#endif - } -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_miopen_dir, "libMIOpen.so", false); -#else -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - if (CUDA_VERSION >= 12030) { - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libcudnn.so.9", false, {cuda_lib_path}); - } else { - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libcudnn.so.8", false, {cuda_lib_path}); - } -#else - return GetDsoHandleFromSearchPath( - FLAGS_cudnn_dir, "libmcdnn.so", false, {cuda_lib_path}); -#endif -#endif -} - -void* GetCUPTIDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libcupti.dylib", false, {cupti_lib_path}); -#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libcupti.so.11.8", false, {cupti_lib_path}); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libmcpti.so", false, {cupti_lib_path}); -#endif - - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libcupti.so.12", false, {cupti_lib_path}); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libmcpti.so", false, {cupti_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#else - return GetDsoHandleFromSearchPath( - FLAGS_cupti_dir, "libmcpti.so", false, {cupti_lib_path}); -#endif -} - -void* GetCurandDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, "curand64_10.dll", true, {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_curand_lib, true, {cuda_lib_path}); -#endif -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhiprand.so"); -#else -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so.10"); -#else - return GetDsoHandleFromSearchPath(FLAGS_curand_dir, "libcurand.so"); -#endif - -#endif -} - -#ifdef PADDLE_WITH_HIP -void* GetROCFFTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocfft.dylib"); -#else - 
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libhipfft.so"); -#endif -} -#endif - -void* GetNvjpegDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_nvjpeg_lib, true, {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so"); -#endif -} - -void* GetCusolverDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, "cusolver64_11.dll", true, {cuda_lib_path}); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cusolver_lib, true, {cuda_lib_path}); -#endif -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsolver.so"); -#else -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so.11"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcsolver.so"); -#endif -#endif -} - -void* GetCusparseDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.dylib"); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_11.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cusparse64_12.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.11"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libmcsparse.so"); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libcusparse.so.12"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cusparse_dir, "libmcsparse.so"); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 12, paddle " - "temporarily no longer."); - return nullptr; - } -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcsparse.so"); -#endif -} - -void* GetNVRTCDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib", false); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcruntime.so", false); -#endif -} - -void* GetCUDADsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return 
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib", false); -#elif defined(PADDLE_WITH_HIP) - return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "libamdhip64.so", false); -#elif defined(_WIN32) - char system32_dir[MAX_PATH]; - GetSystemDirectory(system32_dir, MAX_PATH); - return GetDsoHandleFromSearchPath(system32_dir, "nvcuda.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libmcruntime.so", false); -#endif -} - -void* GetWarpCTCDsoHandle() { - std::string warpctc_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - warpctc_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(warpctc_dir, "warpctc.dll"); -#else - return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.so"); -#endif -} - -void* GetWarpRNNTDsoHandle() { - std::string warprnnt_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - warprnnt_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(warprnnt_dir, "libwarprnnt.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(warprnnt_dir, "warprnnt.dll"); -#else - return GetDsoHandleFromSearchPath(warprnnt_dir, "libwarprnnt.so"); -#endif -} - -void* GetFlashAttnDsoHandle() { - std::string flashattn_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - flashattn_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattn.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(flashattn_dir, "flashattn.dll"); -#else - return GetDsoHandleFromSearchPath(flashattn_dir, "libmcFlashAttn.so"); -#endif -} - -void* GetFlashAttnV3DsoHandle() { - std::string flashattn_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - flashattn_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattnv3.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(flashattn_dir, "flashattnv3.dll"); -#else - return GetDsoHandleFromSearchPath(flashattn_dir, "libflashattnv3.so"); -#endif -} - -void* GetAfsApiDsoHandle() { - std::string afsapi_dir = ""; - if (!s_py_site_pkg_path.path.empty()) { - afsapi_dir = s_py_site_pkg_path.path; - } -#if defined(__APPLE__) || defined(__OSX__) || defined(_WIN32) - return NULL; -#else - return GetDsoHandleFromSearchPath(afsapi_dir, "libafs-api-so.so"); -#endif -} - -void* GetNCCLDsoHandle() { -#ifdef PADDLE_WITH_HIP - std::string warning_msg( - "You may need to install 'rccl' from ROCM official website: " - "https://rocmdocs.amd.com/en/latest/Installation_Guide/" - "Installation-Guide.html before install PaddlePaddle."); -#else - std::string warning_msg( - "You may need to install 'nccl2' from NVIDIA official website: " - "https://developer.nvidia.com/nccl/nccl-download " - "before install PaddlePaddle."); -#endif - -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libnccl.dylib", true, {}, warning_msg); -#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) - return GetDsoHandleFromSearchPath( - FLAGS_rccl_dir, "librccl.so", true, {}, warning_msg); -#else -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libnccl.so;libnccl.so.2", true, {}, warning_msg); -#else - return GetDsoHandleFromSearchPath( - FLAGS_nccl_dir, "libmccl.so", true, 
{}, warning_msg); -#endif - -#endif -} - -// void* GetFLAGCXDsoHandle() { -// #ifdef PADDLE_WITH_FLAGCX -// return GetDsoHandleFromSearchPath(FLAGS_flagcx_dir, "libflagcx.so"); -// #else -// return nullptr; -// #endif -// } - -void* GetTensorRtDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); -#endif -} - -void* GetMKLMLDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "mklml.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "libmklml_intel.so"); -#endif -} - -void* GetLAPACKDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) -#if defined(__arm__) || defined(__aarch64__) - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dylib"); -#else - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.3.dylib"); -#endif -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so.3"); -#endif -} - -void* GetOpDsoHandle(const std::string& dso_name) { - return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name); -} - -void* GetNvtxDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - PADDLE_THROW(common::errors::Unimplemented("Nvtx do not support Apple.")); -#elif defined(_WIN32) - PADDLE_THROW(common::errors::Unimplemented("Nvtx do not support Windows.")); -#elif !defined(PADDLE_WITH_CUDA) - PADDLE_THROW( - common::errors::Unimplemented("Nvtx do not support without CUDA.")); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvToolsExt.so"); -#endif -} - -void* GetCUFFTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.dylib"); -#elif defined(__linux__) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.10"); -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so.11"); - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer."); - return nullptr; - } -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - if (CUDA_VERSION >= 11000 && CUDA_VERSION < 12000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_10.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); -#endif - } else if (CUDA_VERSION >= 12000 && CUDA_VERSION < 13000) { -#ifdef PADDLE_WITH_PIP_CUDA_LIBRARIES - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "cufft64_11.dll"); -#else - return GetDsoHandleFromSearchPath( - FLAGS_cuda_dir, win_cufft_lib, true, {cuda_lib_path}); -#endif - } else { - std::string warning_msg( - "Your CUDA_VERSION is less than 11 or greater than 13, paddle " - "temporarily no longer supports"); - return nullptr; - } -#else - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcufft.so"); -#endif -} - 
-void* GetMKLRTDsoHandle() { -#if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.dylib"); -#elif defined(_WIN32) - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "mkl_rt.dll"); -#else - return GetDsoHandleFromSearchPath(FLAGS_mkl_dir, "libmkl_rt.so"); -#endif -} - -void* GetCusparseLtDsoHandle() { -// APIs available after CUDA 11.2 -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11020 && 0 - return GetDsoHandleFromSearchPath(FLAGS_cusparselt_dir, "libcusparseLt.so"); -#else - std::string warning_msg( - "Your CUDA_VERSION less 11.2, not support cusparseLt. " - "If you want to use cusparseLt, please upgrade CUDA and rebuild " - "PaddlePaddle."); - return nullptr; -#endif -} - -void* GetXPTIDsoHandle() { -#ifdef PADDLE_WITH_XPTI - return GetDsoHandleFromSearchPath(FLAGS_xpti_dir, "libxpti.so"); -#else - return nullptr; -#endif -} -} // namespace phi::dynload diff --git a/backends/metax_gpu/kernels/dynload/dynamic_loader.h b/backends/metax_gpu/kernels/dynload/dynamic_loader.h deleted file mode 100644 index a5d3d0ff76c..00000000000 --- a/backends/metax_gpu/kernels/dynload/dynamic_loader.h +++ /dev/null @@ -1,61 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/utils/test_macros.h" -namespace phi { -namespace dynload { - -#ifndef _WIN32 -#define DECLARE_TYPE(__name, ...) decltype(__name(__VA_ARGS__)) -#else -#define DECLARE_TYPE(__name, ...) 
decltype(auto) -#endif - -void* GetCublasDsoHandle(); -void* GetCublasLtDsoHandle(); -TEST_API void* GetCUDNNDsoHandle(); -void* GetCUPTIDsoHandle(); -void* GetCurandDsoHandle(); -void* GetNvjpegDsoHandle(); -void* GetCusolverDsoHandle(); -void* GetCusparseDsoHandle(); -void* GetNVRTCDsoHandle(); -void* GetCUDADsoHandle(); -void* GetWarpCTCDsoHandle(); -void* GetWarpRNNTDsoHandle(); -void* GetFlashAttnDsoHandle(); -void* GetFlashAttnV3DsoHandle(); -void* GetNCCLDsoHandle(); -// void* GetFLAGCXDsoHandle(); -void* GetTensorRtDsoHandle(); -void* GetMKLMLDsoHandle(); -void* GetLAPACKDsoHandle(); -void* GetOpDsoHandle(const std::string& dso_name); -void* GetNvtxDsoHandle(); -void* GetCUFFTDsoHandle(); -void* GetMKLRTDsoHandle(); -void* GetROCFFTDsoHandle(); -void* GetCusparseLtDsoHandle(); -void* GetXPTIDsoHandle(); -void* GetAfsApiDsoHandle(); - -void SetPaddleLibPath(const std::string&); - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/affine_grid_utils.h b/backends/metax_gpu/kernels/funcs/affine_grid_utils.h index c137d9ad468..b973d75a9be 100644 --- a/backends/metax_gpu/kernels/funcs/affine_grid_utils.h +++ b/backends/metax_gpu/kernels/funcs/affine_grid_utils.h @@ -14,8 +14,8 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/funcs/blas/blas.cc b/backends/metax_gpu/kernels/funcs/blas/blas.cc deleted file mode 100644 index 098a0400552..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blas.cc +++ /dev/null @@ -1,59 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// clang-format off -#include "funcs/blas/blas.h" // NOLINT -#include "paddle/phi/core/enforce.h" -// clang-format on -namespace phi { -namespace funcs { -MatDescriptor CreateMatrixDescriptor(const DDim &tensor_dim, - int num_flatten_cols, - bool trans) { - PADDLE_ENFORCE_GT( - tensor_dim.size(), - 1, - phi::errors::InvalidArgument("The tensor dim size should be greater " - "than 1, but reveived dim size is %d", - tensor_dim.size())); - MatDescriptor retv; - if (num_flatten_cols > 1) { - auto flatten_dim = common::flatten_to_2d(tensor_dim, num_flatten_cols); - retv.height_ = flatten_dim[0]; - retv.width_ = flatten_dim[1]; - } else { - if (tensor_dim.size() == 2) { - retv.height_ = tensor_dim[0]; - retv.width_ = tensor_dim[1]; - } else { - auto dim_vec = common::vectorize(tensor_dim); - retv.batch_size_ = 1; - for (size_t i = 0; i < dim_vec.size() - 2; ++i) { - retv.batch_size_ *= dim_vec[i]; - } - retv.height_ = dim_vec[dim_vec.size() - 2]; - retv.width_ = dim_vec[dim_vec.size() - 1]; - retv.stride_ = retv.height_ * retv.width_; - } - } - if (trans) { - std::swap(retv.width_, retv.height_); - } - retv.trans_ = trans; - return retv; -} -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/blas.h b/backends/metax_gpu/kernels/funcs/blas/blas.h deleted file mode 100644 index 75ea8c921e2..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blas.h +++ /dev/null @@ -1,631 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" - -#ifdef PADDLE_WITH_MKLML -#include "paddle/phi/backends/dynload/mklml.h" -#endif - -#ifdef PADDLE_WITH_LIBXSMM -#include -#endif - -#if defined(PADDLE_USE_OPENBLAS) || defined(PADDLE_USE_REFERENCE_CBLAS) -#include -#endif -// #include "paddle/phi/core/enforce_metax.h" -namespace phi { -namespace funcs { - -/** - * Matrix Descriptor of a memory buffer. - * - * It is used for Blas::MatMul. MatMul operator can be batched. - * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a - * `batch_size` times of GEMM. The batched GEMM could be faster base on the - * implementation of the blas library. The batch size could be zero. If any - * matrix of `matmul` has a batch size, there will be a batched GEMM, too. e.g., - * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be - * [BatchSize, H1, W2] - * - * The boolean flag, `trans`, describe the memory is the transpose of matrix or - * not. If the trans is true, the last two dims of matrix are transposed. The - * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height]. - * - * The MatDescriptor is not only the dimension or shape of a matrix, it also - * contains the layout, stride of matrix. It is clearer to have a structure than - * reuse `DDim`. 
- */ -struct MatDescriptor { - int64_t height_; - int64_t width_; - int64_t stride_{0}; - int64_t batch_size_{0}; - bool trans_; -}; - -/** - * Create Matrix Descriptor from a tensor dim, num_flatten_cols, and transpose - * flag - * - * @param tensor_dim: The dimension of the tensor. The rank of this dimension - * must larger than 1. - * - * @param num_flatten_cols: Reshape a tensor to a matrix. The matrix's first - * dimension(column length) will be the product of tensor's first `num_col_dims` - * dimensions. If num_flatten_cols is zero, the first N-2 dimension will be the - * batch_size of descriptor. - * - * @param trans: True if the matrix is transposed. - */ -extern MatDescriptor CreateMatrixDescriptor(const DDim& tensor_dim, - int num_flatten_cols, - bool trans); - -template -class Blas { - public: - explicit Blas(const DeviceContext& context) : dev_ctx_(context) {} - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T* A, - const T* B, - T beta, - T* C) const; - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T* A, - const T* B, - U beta, - T* C) const; - - template - void GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) const; - - template - void GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T* A, - int lda, - const T* B, - int ldb, - T beta, - T* C, - int ldc) const; - -#ifdef PADDLE_WITH_MKLML // @{ Group MKLML: class Blas - template - T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, - const int M, - const int N, - const int K) const; - - template - void GEMM_PACK(const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T* src, - const int ld, - T* dst) const; - - template - void GEMM_COMPUTE(int transA, - int transB, - int M, - int N, - int K, - const T* A, - const int lda, - const T* B, - const int ldb, - T beta, - T* C, - const int ldc) const; - - template - void GEMM_FREE(T* data) const; - - template - void CSRMM(const char* transa, - const int* m, - const int* n, - const int* k, - const T* alpha, - const char* matdescra, - const T* val, - const int* indx, - const int* pntrb, - const int* pntre, - const T* b, - const int* ldb, - const T* beta, - T* c, - const int* ldc) const; - -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - template - void MatMulWithHead(const phi::DenseTensor& mat_a, - const MatDescriptor& dim_a, - const phi::DenseTensor& mat_b, - const MatDescriptor& dim_b, - T alpha, - int head_number, - phi::DenseTensor* mat_out, - T beta, - bool mat_y_split_vertical) const; -#endif -#endif // @} End Group MKLML: class Blas - - template - void MatMul(const int M, - const int N, - const int K, - const T* A, - const T* B, - T* C) const; - - template - void MatMul(const phi::DenseTensor& mat_a, - bool trans_a, - const phi::DenseTensor& mat_b, - bool trans_b, - T alpha, - phi::DenseTensor* mat_out, - T beta) const; - - template - void MatMul(const phi::DenseTensor& mat_a, - bool trans_a, - const phi::DenseTensor& mat_b, - bool trans_b, - phi::DenseTensor* mat_out) const { - MatMul(mat_a, - trans_a, - mat_b, - trans_b, - static_cast(1.0), - mat_out, - static_cast(0.0)); - } - - template - void MatMul(const phi::DenseTensor& mat_a, - const phi::DenseTensor& mat_b, - phi::DenseTensor* mat_out) const { - 
this->template MatMul(mat_a, false, mat_b, false, mat_out); - } - - template - void AXPY(int n, T alpha, const T* x, T* y) const; - - template - void VADD(int n, const T* x, const T* y, T* z) const; - - template - void VSUB(int n, const T* x, const T* y, T* z) const; - - template - void VMUL(int n, const T* x, const T* y, T* z) const; - - template - void VDIV(int n, const T* x, const T* y, T* z) const; - - template - void VCOPY(int n, const T* x, T* y) const; - - template - void VEXP(int n, const T* x, T* y) const; - - template - void VSQUARE(int n, const T* x, T* y) const; - - template - void VPOW(int n, const T* x, T alpha, T* y) const; - - template - void GEMV(bool trans_a, - int M, - int N, - T alpha, - const T* A, - const T* B, - T beta, - T* C) const; - - template - T DOT(int n, const T* x, const T* y) const; - - template - void CUDOT( - int n, const T* x, int incx, const T* y, int incy, T* result) const; - template - void SCAL(int n, const T a, T* x) const; - - template - T ASUM(int n, T* x, int inc) const; - - template - void BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T* A, - const T* B, - T beta, - T* C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const; - - template - void BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T* A, - const T* B, - U beta, - T* C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const; - - template - void BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T** A, - const T** B, - T beta, - T** C, - int batchCount) const; - -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) - template - void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int W1, - int H1, - int W2, - int H2, - T alpha, - const T* A, - const T* B, - T beta, - T* C, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t head_number, - bool split_b_vertical) const; -#endif - - template - void MatMul(const phi::DenseTensor& mat_a, - const MatDescriptor& dim_a, - const phi::DenseTensor& mat_b, - const MatDescriptor& dim_b, - T alpha, - phi::DenseTensor* mat_out, - T beta) const; - - template - void MatMul(const T* mat_a, - const MatDescriptor& dim_a, - const T* mat_b, - const MatDescriptor& dim_b, - T alpha, - T* mat_out, - T beta) const; - - template - void VINV(int n, const T* a, T* y) const; - - template - void VMERF(int n, const T* a, T* y, int64_t mode) const; - - template - void TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T* A, - int lda, - T* B, - int ldb) const; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - template - void BatchedGETRF(int n, T** a, int* ipiv, int* info, int batch_size) const; - - template - void BatchedGETRI(int n, - const T** a, - const int* ipiv, - T** a_inv, - int* info, - int batch_size) const; - - template - void BatchedMatInv( - int n, const T** a, T** a_inv, int* info, int batch_size) const; - - // cuBlas solve - template - void BatchedGETRS(CBLAS_TRANSPOSE trans, - int n, - int nrhs, - const T** a, - int lda, - int* ipiv, - T** b, - int ldb, - int* info, - int batch_size) const; - - // cuBlas triangular_solve - template - void BatchedTRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T** 
a, - int lda, - T** b, - int ldb, - int batch_size) const; -#endif - - private: - const DeviceContext& dev_ctx_; -}; - -template -class BlasT : private Blas { - public: - using Blas::Blas; - - template - void GEMM(ARGS... args) const { - Base()->template GEMM(args...); - } - -#ifdef PADDLE_WITH_MKLML // @{ Group MKLML: class BlasT - template - T* GEMM_ALLOC(ARGS... args) const { - return Base()->template GEMM_ALLOC(args...); - } - - template - void GEMM_PACK(ARGS... args) const { - Base()->template GEMM_PACK(args...); - } - - template - void GEMM_COMPUTE(ARGS... args) const { - Base()->template GEMM_COMPUTE(args...); - } - - template - void GEMM_FREE(ARGS... args) const { - Base()->template GEMM_FREE(args...); - } - - template - void CSRMM(ARGS... args) const { - Base()->template CSRMM(args...); - } - -#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) - template - void MatMulWithHead(ARGS... args) const { - Base()->template MatMulWithHead(args...); - } -#endif -#endif // @} End Group MKLML: class BlasT - - template - void MatMul(ARGS... args) const { - Base()->template MatMul(args...); - } - - template - void AXPY(ARGS... args) const { - Base()->template AXPY(args...); - } - - template - void VADD(ARGS... args) const { - Base()->template VADD(args...); - } - - template - void VSUB(ARGS... args) const { - Base()->template VSUB(args...); - } - - template - void VMUL(ARGS... args) const { - Base()->template VMUL(args...); - } - - template - void VDIV(ARGS... args) const { - Base()->template VDIV(args...); - } - - template - void VCOPY(ARGS... args) const { - Base()->template VCOPY(args...); - } - - template - void VEXP(ARGS... args) const { - Base()->template VEXP(args...); - } - - template - void VSQUARE(ARGS... args) const { - Base()->template VSQUARE(args...); - } - - template - void VPOW(ARGS... args) const { - Base()->template VPOW(args...); - } - - template - void GEMV(ARGS... args) const { - Base()->template GEMV(args...); - } - - template - T DOT(ARGS... args) const { - return Base()->template DOT(args...); - } - template - void CUDOT(ARGS... args) const { - Base()->template CUDOT(args...); - } - template - void SCAL(ARGS... args) const { - Base()->template SCAL(args...); - } - - template - T ASUM(ARGS... args) const { - return Base()->template ASUM(args...); - } - - template - void BatchedGEMM(ARGS... args) const { - Base()->template BatchedGEMM(args...); - } - - template - void VINV(ARGS... args) const { - Base()->template VINV(args...); - } - - template - void VMERF(ARGS... args) const { - Base()->template VMERF(args...); - } - - template - void TRSM(ARGS... args) const { - Base()->template TRSM(args...); - } - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - template - void BatchedGETRF(ARGS... args) const { - Base()->template BatchedGETRF(args...); - } - - template - void BatchedGETRI(ARGS... args) const { - Base()->template BatchedGETRI(args...); - } - - template - void BatchedMatInv(ARGS... args) const { - Base()->template BatchedMatInv(args...); - } - - // solve - template - void BatchedGETRS(ARGS... args) const { - Base()->template BatchedGETRS(args...); - } - - // triangular_solve - template - void BatchedTRSM(ARGS... 
args) const { - Base()->template BatchedTRSM(args...); - } -#endif - - private: - const Blas* Base() const { - return static_cast*>(this); - } -}; - -template -inline BlasT GetBlas(const DeviceContext& dev_ctx) { - return BlasT(dev_ctx); -} - -} // namespace funcs -} // namespace phi -// clang-format off -#include "./blas_impl.h" -#ifdef PADDLE_WITH_CUDA -#include "./blas_impl.cu.h" -#endif -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/funcs/blas/blas_impl.hip.h" -#endif -// clang-format on diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h deleted file mode 100644 index ae4baa52613..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h +++ /dev/null @@ -1,3027 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#if defined(__NVCC__) -#include -#endif -#include "./cublas.h" -#include "glog/logging.h" -#include "paddle/common/flags.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -// #include "paddle/phi/core/flags.h" -#include "paddle/phi/core/enforce.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define INT_MAX_VALUE 2147483647 - -PHI_DECLARE_bool(enable_cublas_tensor_op_math); -PHI_DECLARE_bool(gemm_use_half_precision_compute_type); - -namespace phi { -namespace funcs { -template -struct CUBlas; - -template <> -struct CUBlas { - template - static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemm(args...)); - } - - template - static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSaxpy(args...)); - } - - template - static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSscal(args...)); - } - - template - static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasScopy(args...)); - } - - template - static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemv(args...)); - } - - template - static void GEMM_BATCH(ARGS... args) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmBatched(args...)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "SgemmBatched is not supported on cuda <= 7.5")); -#endif - } - - template - static void GEMM_STRIDED_BATCH(ARGS... args) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasSgemmStridedBatched(args...)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "SgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
- // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const float *beta, - void *C, - cudaDataType_t Ctype, - int ldc) { -// Because the gcc 4.8 doesn't expand template parameter pack that -// appears in a lambda-expression, I can not use template parameter pack -// here. -#if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (dev_ctx->tensor_core_available() ? "True" : "False"); - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasSgemmEx is not supported on cuda <= 7.5")); -#endif - } - - template - static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasStrsm(args...)); - } - - template - static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetrfBatched(args...)); - } - - template - static void GETRI_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetriBatched(args...)); - } - - template - static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSmatinvBatched(args...)); - } - - template - static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgetrsBatched(args...)); - } - - template - static void TRSM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasStrsmBatched(args...)); - } -}; - -template <> -struct CUBlas { - template - static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemm(args...)); - } - - template - static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDaxpy(args...)); - } - - template - static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDscal(args...)); - } - - template - static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDcopy(args...)); - } - - template - static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemv(args...)); - } - - template - static void GEMM_BATCH(ARGS... args) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgemmBatched(args...)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "DgemmBatched is not supported on cuda <= 7.5")); -#endif - } - - template - static void GEMM_STRIDED_BATCH(ARGS... args) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasDgemmStridedBatched(args...)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "DgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - template - static void GEMM_EX(ARGS... args UNUSED) { - PADDLE_THROW( - phi::errors::Unimplemented("Currently there are not cublasDgemmEx.")); - } - - template - static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDtrsm(args...)); - } - - template - static void GETRF_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetrfBatched(args...)); - } - - template - static void GETRI_BATCH(ARGS... 
args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetriBatched(args...)); - } - - template - static void MATINV_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDmatinvBatched(args...)); - } - - template - static void GETRS_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDgetrsBatched(args...)); - } - - template - static void TRSM_BATCH(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasDtrsmBatched(args...)); - } -}; - -template <> -struct CUBlas { - using float16 = phi::dtype::float16; - - static void GEMM(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float16 *alpha, - const float16 *A, - int lda, - const float16 *B, - int ldb, - const float16 *beta, - float16 *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasHgemm(handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - -#if defined(__NVCC__) - static void GEMM_BATCH(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, - const float16 **A, - cudaDataType_t Atype, - int lda, - const float16 **B, - cudaDataType_t Btype, - int ldb, - const float *beta, - float16 **C, - cudaDataType_t Ctype, - int ldc, - int batchCount, - cublasComputeType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A_ptr.data().get(), - Atype, - lda, - B_ptr.data().get(), - Btype, - ldb, - beta, - C_ptr.data().get(), - Ctype, - ldc, - batchCount, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmBatchedEx is not supported on cuda <= 7.5")); -#endif - } -#endif - - static void GEMM_STRIDED_BATCH(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float16 *alpha, - const float16 *A, - int lda, - long long int strideA, // NOLINT - const float16 *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const float16 *beta, - float16 *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasHgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "HgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
- // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cublasComputeType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } -}; - -template <> -struct CUBlas> { - static void GEMV(cublasHandle_t handle, - cublasOperation_t transa, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemv( - handle, - transa, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void AXPY(cublasHandle_t handle, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCaxpy( - handle, - n, - reinterpret_cast(alpha), - reinterpret_cast(X), - incX, - reinterpret_cast(Y), - incY)); - } - - static void GEMM_STRIDED_BATCH(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - long long int strideA, // NOLINT - const phi::dtype::complex *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "CgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - static void GEMM(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgemm( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - 
reinterpret_cast(C), - ldc)); - } - - static void TRSM(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t transa, - cublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCtrsm( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. - // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cublasComputeType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } - - static void TRSM_BATCH(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t transa, - cublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **B, - int ldb, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCtrsmBatched( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - batch_size)); - } - // ****************************************************************新增模版定义********************* - - static void GETRF_BATCH(cublasHandle_t handle, - int n, - phi::dtype::complex **A, - int lda, - int *ipiv, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetrfBatched( - handle, - n, - reinterpret_cast(A), - lda, - ipiv, - info, - batch_size)); - } - - static void GETRI_BATCH(cublasHandle_t handle, - int n, - const phi::dtype::complex **A, - int lda, - const int *ipiv, - phi::dtype::complex **Ainv, - int ldc, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCgetriBatched( - handle, - n, - reinterpret_cast(A), - lda, - ipiv, - reinterpret_cast(Ainv), - ldc, - info, - batch_size)); - } - - static void MATINV_BATCH(cublasHandle_t handle, - int n, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **Ainv, - int lda_inv, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasCmatinvBatched( - handle, - n, - reinterpret_cast(A), - lda, - reinterpret_cast(Ainv), - lda_inv, - info, - batch_size)); - } - // ****************************************************************新增模版定义********************* -}; - -template <> -struct 
CUBlas> { - static void GEMV(cublasHandle_t handle, - cublasOperation_t transa, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemv( - handle, - transa, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void AXPY(cublasHandle_t handle, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZaxpy( - handle, - n, - reinterpret_cast(alpha), - reinterpret_cast(X), - incX, - reinterpret_cast(Y), - incY)); - } - - static void GEMM_STRIDED_BATCH( - cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - long long int strideA, // NOLINT - const phi::dtype::complex *B, // NOLINT - int ldb, - long long int strideB, // NOLINT - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc, - long long int strideC, // NOLINT - int batchCount) { -#if CUDA_VERSION >= 8000 - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemmStridedBatched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "CgemmStridedBatched is not supported on cuda <= 7.5")); -#endif - } - - static void GEMM(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - const phi::dtype::complex *beta, - phi::dtype::complex *C, - int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgemm( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc)); - } - - static void TRSM(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t transa, - cublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZtrsm( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb)); - } - - static void TRSM_BATCH(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t transa, - cublasDiagType_t diag, - int m, - int n, - const phi::dtype::complex *alpha, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **B, - int ldb, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZtrsmBatched( - handle, - side, - uplo, - transa, - diag, - m, - n, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - reinterpret_cast(B), - ldb, - batch_size)); - } - - // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
- // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode - template - static void GEMM_EX(phi::GPUContext *dev_ctx, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, - const void *A, - cudaDataType_t Atype, - int lda, - const void *B, - cudaDataType_t Btype, - int ldb, - const void *beta, - void *C, - cudaDataType_t Ctype, - int ldc, - cublasComputeType_t computeType) { -#if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); -#endif // CUDA_VERSION >= 9000 - - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }); -#else - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx is not supported on cuda <= 7.5")); -#endif - } - // &*******************************************新增模版定义************************* - static void GETRF_BATCH(cublasHandle_t handle, - int n, - phi::dtype::complex **A, - int lda, - int *ipiv, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetrfBatched( - handle, - n, - reinterpret_cast(A), - lda, - ipiv, - info, - batch_size)); - } - - static void GETRI_BATCH(cublasHandle_t handle, - int n, - const phi::dtype::complex **A, - int lda, - const int *ipiv, - phi::dtype::complex **Ainv, - int ldc, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZgetriBatched( - handle, - n, - reinterpret_cast(A), - lda, - ipiv, - reinterpret_cast(Ainv), - ldc, - info, - batch_size)); - } - - static void MATINV_BATCH(cublasHandle_t handle, - int n, - const phi::dtype::complex **A, - int lda, - phi::dtype::complex **Ainv, - int lda_inv, - int *info, - int batch_size) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasZmatinvBatched( - handle, - n, - reinterpret_cast(A), - lda, - reinterpret_cast(Ainv), - lda_inv, - info, - batch_size)); - } - // &*******************************************新增模版定义************************* -}; - -inline void CheckGEMMNSize(int64_t N) { - constexpr int64_t kMaxN = 1073741823; - if (N > kMaxN) { - PADDLE_THROW(common::errors::Unimplemented( - "cublas GEMM does not support N > %ld. Got N = %ld. ", kMaxN, N)); - } -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - -#if CUDA_VERSION >= 8000 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(dev_ctx_); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "CUBlas::GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif - } else { - CheckGEMMNSize(N); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_32F, - ldb, - A, - CUDA_R_32F, - lda, - &beta, - C, - CUDA_R_32F, - N); - } - } else { -#endif // CUDA_VERSION >= 8000 - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); - } else { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }); - } - -#if CUDA_VERSION >= 8000 - } -#endif // CUDA_VERSION >= 8000 -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 53," - "but received %d", - dev_ctx_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - -#if CUDA_VERSION >= 8000 - // cublasHgemm does true FP16 computation which is slow for non-Volta - // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: - // input/output in fp16, computation in fp32, which can also be accelerated - // using tensor cores in volta GPUs. - auto &cuda_ctx = const_cast(dev_ctx_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16F, - ldb, - A, - CUDA_R_16F, - lda, - &h_beta, - C, - CUDA_R_16F, - N, - CUBLAS_COMPUTE_32F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - h_B, - ldb, - h_A, - lda, - &h_beta, - h_C, - N); - }); -#endif // CUDA_VERSION >= 8000 -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T *A, - const T *B, - U beta, - T *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - - T t_alpha = static_cast(alpha); - T t_beta = static_cast(beta); - -#if CUDA_VERSION >= 8000 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(dev_ctx_); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented("GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif - } else { - CheckGEMMNSize(N); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - CUDA_R_32F, - static_cast(ldb), - A, - CUDA_R_32F, - static_cast(lda), - &t_beta, - C, - CUDA_R_32F, - static_cast(N)); - } - } else { -#endif // CUDA_VERSION >= 8000 - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); - } else { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - static_cast(ldb), - A, - static_cast(lda), - &t_beta, - C, - static_cast(N)); - }); - } - -#if CUDA_VERSION >= 8000 - } -#endif // CUDA_VERSION >= 8000 -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - float alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - float beta, - phi::dtype::float16 *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - // PADDLE_ENFORCE_GE( - // dev_ctx_.GetComputeCapability(), - // 53, - // common::errors::InvalidArgument( - // "cublas fp16 gemm requires GPU compute capability >= 53," - // "but received %d", - // dev_ctx_.GetComputeCapability())); - - float h_alpha = alpha; - float h_beta = beta; - -#if CUDA_VERSION >= 8000 - auto &cuda_ctx = const_cast(dev_ctx_); -#endif - // cublasHgemm does true FP16 computation which is slow for non-Volta - // GPUs. So use cublasGemmEx instead which does pseudo FP16 computation: - // input/output in fp16, computation in fp32, which can also be accelerated - // using tensor cores in volta GPUs. 
- if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented("GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { -#if CUDA_VERSION >= 8000 - CheckGEMMNSize(N); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16F, - static_cast(ldb), - A, - CUDA_R_16F, - static_cast(lda), - &h_beta, - C, - CUDA_R_16F, - static_cast(N), - CUBLAS_COMPUTE_32F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &h_beta, - h_C, - static_cast(N)); - }); -#endif // CUDA_VERSION >= 8000 - } -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 80, - phi::errors::InvalidArgument( - "cublas bf16 gemm requires GPU compute capability >= 80," - "but received %d", - dev_ctx_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW( - common::errors::Unimplemented("cublasGemmEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - CheckGEMMNSize(N); - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUBLAS_COMPUTE_32F, - algo)); - }); - } -#else - // raise error - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex64 gemm requires GPU compute capability >= 53," - "but received %d", - dev_ctx_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = thrust::complex(beta.real, beta.imag); - -#if CUDA_VERSION >= 8000 - auto &cuda_ctx = const_cast(dev_ctx_); -#endif - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented("GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { -#if CUDA_VERSION >= 8000 - CheckGEMMNSize(N); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - B, - CUDA_C_32F, - static_cast(ldb), - A, - CUDA_C_32F, - static_cast(lda), - &c_beta, - C, - CUDA_C_32F, - static_cast(N), - CUBLAS_COMPUTE_32F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }); -#endif // CUDA_VERSION >= 8000 - } -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - const phi::dtype::complex *B, - phi::dtype::complex beta, - phi::dtype::complex *C) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. 
- int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // TODO(kexinzhao): add processing code for compute capability < 53 case - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas complex128 gemm requires GPU compute capability >= 53," - "but received %d", - dev_ctx_.GetComputeCapability())); - - thrust::complex c_alpha = - thrust::complex(alpha.real, alpha.imag); - thrust::complex c_beta = - thrust::complex(beta.real, beta.imag); - -#if CUDA_VERSION >= 8000 - auto &cuda_ctx = const_cast(dev_ctx_); -#endif - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented("GEMM_EX_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "GEMM_EX_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { -#if CUDA_VERSION >= 8000 - CheckGEMMNSize(N); - CUBlas>::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - B, - CUDA_C_64F, - static_cast(ldb), - A, - CUDA_C_64F, - static_cast(lda), - &c_beta, - C, - CUDA_C_64F, - static_cast(N), - CUBLAS_COMPUTE_64F); -#else - // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }); -#endif // CUDA_VERSION >= 8000 - } -} - -template <> -template <> -inline void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - float alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - float beta, - phi::dtype::bfloat16 *C) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // PADDLE_ENFORCE_GE( - // dev_ctx_.GetComputeCapability(), - // 80, - // common::errors::InvalidArgument( - // "cublas bf16 gemm requires GPU compute capability >= 80," - // "but received %d", - // dev_ctx_.GetComputeCapability())); - - float h_alpha = alpha; - float h_beta = beta; - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW( - common::errors::Unimplemented("cublasGemmEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - CheckGEMMNSize(N); - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - A, - CUDA_R_16BF, - static_cast(lda), - &h_beta, - C, - CUDA_R_16BF, - static_cast(N), - CUDA_R_32F, - algo)); - }); - } -#else - // raise error - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - -#if CUDA_VERSION >= 8000 - if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - auto &cuda_ctx = const_cast(dev_ctx_); - CUBlas::GEMM_EX(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - CUDA_R_32F, - ldb, - A, - CUDA_R_32F, - lda, - &beta, - C, - CUDA_R_32F, - ldc); - } else { -#endif // CUDA_VERSION >= 8000 - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); - -#if CUDA_VERSION >= 8000 - } -#endif // CUDA_VERSION >= 8000 -} - -template <> -template <> -inline void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - int lda, - const phi::dtype::float16 *B, - int ldb, - phi::dtype::float16 beta, - phi::dtype::float16 *C, - int ldc) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }); -} - -template <> -template <> -inline void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - int lda, - const phi::dtype::bfloat16 *B, - int ldb, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int ldc) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; - - // PADDLE_ENFORCE_GE( - // dev_ctx_.GetComputeCapability(), - // 80, - // phi::errors::InvalidArgument( - // "cublas bf16 gemm requires GPU compute capability >= 80," - // "but received %d", - // dev_ctx_.GetComputeCapability())); - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - ldc, - CUBLAS_COMPUTE_32F, - algo)); - }); -#else - // raise error - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} - -template <> -template -void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }); -} - -template <> -template -void Blas::SCAL(int n, const T alpha, T *x) const { - dev_ctx_.CublasCall( - [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - dev_ctx_.CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::float16 alpha, - const phi::dtype::float16 *A, - const phi::dtype::float16 *B, - phi::dtype::float16 beta, - phi::dtype::float16 *C) const { - // Because cublas doesn't support half gemv, we use cublasHgemm to achieve it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template <> -inline void Blas::GEMV(bool trans_a, - int M, - int N, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C) const { - // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve - // it. - if (trans_a) { - this->template GEMM( - CblasNoTrans, CblasNoTrans, 1, N, M, alpha, B, A, beta, C); - } else { - this->template GEMM( - CblasNoTrans, CblasNoTrans, M, 1, N, alpha, A, B, beta, C); - } -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? 
N : K; - int64_t ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; - -#if CUDA_VERSION >= 9010 - if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || - std::is_same::value) { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - VLOG(4) << "use_half_precision_compute_type: " - << FLAGS_gemm_use_half_precision_compute_type; - - auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; -#if CUDA_VERSION >= 11000 - auto compute_type = CUBLAS_COMPUTE_32F; -#else - auto compute_type = CUDA_R_32F; -#endif - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - void *a = static_cast(&h_alpha); - void *b = static_cast(&h_beta); - // set ComputeType as CUDA_R_32F for fp16, for better accuracy - if (FLAGS_gemm_use_half_precision_compute_type == true && - std::is_same::value) { - a = static_cast(&alpha); - b = static_cast(&beta); -#if CUDA_VERSION >= 11000 - compute_type = CUBLAS_COMPUTE_16F; -#else - compute_type = CUDA_R_16F; -#endif - } - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); - }); - } - } else { -#endif // CUDA_VERSION >= 9010 - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &beta, - C, - ldc, - strideC, - static_cast(batchCount)); - }); - -#if CUDA_VERSION >= 9010 - } -#endif // CUDA_VERSION >= 9010 -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T *A, - const T *B, - U beta, - T *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - int64_t ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; -#if CUDA_VERSION >= 9010 - if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || - std::is_same::value) { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - VLOG(4) << "use_half_precision_compute_type: " - << FLAGS_gemm_use_half_precision_compute_type; - - auto fp = std::is_same::value ? CUDA_R_32F : CUDA_R_16F; -#if CUDA_VERSION >= 11000 - auto compute_type = CUBLAS_COMPUTE_32F; -#else - auto compute_type = CUDA_R_32F; -#endif - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - void *a = static_cast(&h_alpha); - void *b = static_cast(&h_beta); - // set ComputeType as CUDA_R_32F for fp16, for better accuracy - if (FLAGS_gemm_use_half_precision_compute_type == true && - std::is_same::value) { - a = static_cast(&alpha); - b = static_cast(&beta); -#if CUDA_VERSION >= 11000 - compute_type = CUBLAS_COMPUTE_16F; -#else - compute_type = CUDA_R_16F; -#endif - } - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE || - batchCount > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - a, - B, - fp, - static_cast(ldb), - strideB, - A, - fp, - static_cast(lda), - strideA, - b, - C, - fp, - static_cast(ldc), - strideC, - static_cast(batchCount), - compute_type, - algo)); - }); - } - } else { -#endif // CUDA_VERSION >= 9010 - T h_alpha = static_cast(alpha); - T h_beta = static_cast(beta); - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &h_beta, - C, - static_cast(ldc), - strideC, - static_cast(batchCount)); - }); - -#if CUDA_VERSION >= 9010 - } -#endif // CUDA_VERSION >= 9010 -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int64_t lda = (transA == CblasNoTrans) ? K : M; - int64_t ldb = (transB == CblasNoTrans) ? N : K; - int64_t ldc = N; - - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; - - float h_alpha = static_cast(alpha); - float h_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE || - batchCount > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }); - } -#else - // raise error - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " - "11")); -#endif // CUDA_VERSION >= 11000 -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - float alpha, - const phi::dtype::bfloat16 *A, - const phi::dtype::bfloat16 *B, - float beta, - phi::dtype::bfloat16 *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - const int64_t strideC = M * N; - - float h_alpha = alpha; - float h_beta = beta; - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE || - batchCount > INT_MAX_VALUE) { -#if CUDA_VERSION >= 12030 && defined(__linux__) - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not complete")); -#else - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); -#endif // CUDA_VERSION >= 12030 - } else { - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }); - } -#else - // raise error - PADDLE_THROW(common::errors::Unimplemented( - "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " - "11")); -#endif // CUDA_VERSION >= 11000 -} - -// /*** -// * Uknow bug, parameters dislocation when calling BatchedGEMM. -// * Reference: paddle github PR #45530 and #55612 -// */ -// template <> -// template <> -// inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, -// CBLAS_TRANSPOSE transB, -// int M, -// int N, -// int K, -// float16 alpha, -// const float16 *A, -// const float16 *B, -// float16 beta, -// float16 *C, -// int batchCount, -// int64_t strideA, -// int64_t strideB) const { -// // Note that cublas follows fortran order, so the order is different from -// // the cblas convention. -// int lda = (transA == CblasNoTrans) ? K : M; -// int ldb = (transB == CblasNoTrans) ? N : K; -// int ldc = N; -// cublasOperation_t cuTransA = -// (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// cublasOperation_t cuTransB = -// (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// const int64_t strideC = M * N; - -// #if CUDA_VERSION >= 9010 -// if ((FLAGS_enable_cublas_tensor_op_math && -// (std::is_same::value)) || -// std::is_same::value) { -// cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = dev_ctx_.tensor_core_available(); -// if (use_tensor_op_math) { -// algo = CUBLAS_GEMM_DFALT_TENSOR_OP; -// } -// VLOG(5) << "use_tensor_op_math: " -// << (use_tensor_op_math ? "True" : "False"); -// VLOG(4) << "use_half_precision_compute_type: " -// << FLAGS_gemm_use_half_precision_compute_type; - -// auto fp = std::is_same::value ? 
CUDA_R_32F : CUDA_R_16F; -// #if CUDA_VERSION >= 11000 -// auto compute_type = CUBLAS_COMPUTE_32F; -// #else -// auto compute_type = CUDA_R_32F; -// #endif - -// float h_alpha = static_cast(alpha); -// float h_beta = static_cast(beta); -// void *a = static_cast(&h_alpha); -// void *b = static_cast(&h_beta); -// // set ComputeType as CUDA_R_32F for fp16, for better accuracy -// if (FLAGS_gemm_use_half_precision_compute_type == true && -// std::is_same::value) { -// a = static_cast(&alpha); -// b = static_cast(&beta); -// #if CUDA_VERSION >= 11000 -// compute_type = CUBLAS_COMPUTE_16F; -// #else -// compute_type = CUDA_R_16F; -// #endif -// } - -// dev_ctx_.TensorCoreCublasCallIfAvailable( -// [&](cublasHandle_t handle) { -// PADDLE_ENFORCE_GPU_SUCCESS( -// phi::dynload::cublasGemmStridedBatchedEx(handle, -// cuTransB, -// cuTransA, -// N, -// M, -// K, -// a, -// B, -// fp, -// ldb, -// strideB, -// A, -// fp, -// lda, -// strideA, -// b, -// C, -// fp, -// ldc, -// strideC, -// batchCount, -// compute_type, -// algo)); -// }); -// } else { -// #endif // CUDA_VERSION >= 9010 - -// dev_ctx_.CublasCall( -// [&](cublasHandle_t handle) { -// CUBlas::GEMM_STRIDED_BATCH(handle, -// cuTransB, -// cuTransA, -// N, -// M, -// K, -// &alpha, -// B, -// ldb, -// strideB, -// A, -// lda, -// strideA, -// &beta, -// C, -// ldc, -// strideC, -// batchCount); -// }, -// dev_ctx_.stream()); - -// #if CUDA_VERSION >= 9010 -// } -// #endif // CUDA_VERSION >= 9010 -// } - -// /*** -// * Uknow bug, parameters dislocation when calling BatchedGEMM. -// * Reference: paddle github PR #45530 and #55612 -// */ -// template <> -// template <> -// inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, -// CBLAS_TRANSPOSE transB, -// int M, -// int N, -// int K, -// double alpha, -// const double *A, -// const double *B, -// double beta, -// double *C, -// int batchCount, -// int64_t strideA, -// int64_t strideB) const { -// // Note that cublas follows fortran order, so the order is different from -// // the cblas convention. -// int lda = (transA == CblasNoTrans) ? K : M; -// int ldb = (transB == CblasNoTrans) ? N : K; -// int ldc = N; -// cublasOperation_t cuTransA = -// (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// cublasOperation_t cuTransB = -// (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// const int64_t strideC = M * N; -// dev_ctx_.CublasCall( -// [&](cublasHandle_t handle) { -// PADDLE_ENFORCE_GPU_SUCCESS( -// phi::dynload::cublasDgemmStridedBatched(handle, -// cuTransB, -// cuTransA, -// N, -// M, -// K, -// &alpha, -// B, -// ldb, -// strideB, -// A, -// lda, -// strideA, -// &beta, -// C, -// ldc, -// strideC, -// batchCount)); -// }, -// dev_ctx_.stream()); -// } - -// template <> -// template <> -// inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, -// CBLAS_TRANSPOSE transB, -// int M, -// int N, -// int K, -// phi::dtype::bfloat16 alpha, -// const phi::dtype::bfloat16 *A, -// const phi::dtype::bfloat16 *B, -// phi::dtype::bfloat16 beta, -// phi::dtype::bfloat16 *C, -// int batchCount, -// int64_t strideA, -// int64_t strideB) const { -// #if CUDA_VERSION >= 11000 -// // Note that cublas follows fortran order, so the order is different from -// // the cblas convention. -// int lda = (transA == CblasNoTrans) ? K : M; -// int ldb = (transB == CblasNoTrans) ? N : K; -// int ldc = N; -// cublasOperation_t cuTransA = -// (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; -// cublasOperation_t cuTransB = -// (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; -// const int64_t strideC = M * N; - -// float h_alpha = static_cast(alpha); -// float h_beta = static_cast(beta); - -// cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = dev_ctx->tensor_core_available(); -// if (use_tensor_op_math) { -// algo = CUBLAS_GEMM_DFALT_TENSOR_OP; -// } -// VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : -// "False"); - -// dev_ctx_.TensorCoreCublasCallIfAvailable( -// [&](cublasHandle_t handle) { -// PADDLE_ENFORCE_GPU_SUCCESS( -// phi::dynload::cublasGemmStridedBatchedEx(handle, -// cuTransB, -// cuTransA, -// N, -// M, -// K, -// &h_alpha, -// B, -// CUDA_R_16BF, -// ldb, -// strideB, -// A, -// CUDA_R_16BF, -// lda, -// strideA, -// &h_beta, -// C, -// CUDA_R_16BF, -// ldc, -// strideC, -// batchCount, -// CUBLAS_COMPUTE_32F, -// algo)); -// }); -// #else -// // raise error -// PADDLE_THROW(phi::errors::Unimplemented( -// "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " -// "11")); -// #endif // CUDA_VERSION >= 11000 -// } - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -} - -#if defined(__NVCC__) -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - double alpha, - const double **A, - const double **B, - double beta, - double **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - float alpha, - const float **A, - const float **B, - float beta, - float **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::float16 alpha, - const phi::dtype::float16 **A, - const phi::dtype::float16 **B, - phi::dtype::float16 beta, - phi::dtype::float16 **C, - int batchCount) const { - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - PADDLE_ENFORCE_GE( - dev_ctx_.GetComputeCapability(), - 53, - phi::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 53," - "but received %d", - dev_ctx_.GetComputeCapability())); - float f_alpha = static_cast(alpha); - float f_beta = static_cast(beta); - auto &cuda_ctx = const_cast(dev_ctx_); - CUBlas::GEMM_BATCH(&cuda_ctx, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B, - CUDA_R_16F, - ldb, - A, - CUDA_R_16F, - lda, - &f_beta, - C, - CUDA_R_16F, - ldc, - batchCount, - CUBLAS_COMPUTE_32F); -} - -template <> -template <> -inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - phi::dtype::bfloat16 alpha, - const phi::dtype::bfloat16 **A, - const phi::dtype::bfloat16 **B, - phi::dtype::bfloat16 beta, - phi::dtype::bfloat16 **C, - int batchCount) const { -#if CUDA_VERSION >= 11000 - // Note that cublas follows fortran order, so the order is different from - // the cblas convention. - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = - (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - - // PADDLE_ENFORCE_GE( - // dev_ctx_.GetComputeCapability(), - // 80, - // phi::errors::InvalidArgument( - // "cublas bf16 gemm requires GPU compute capability >= 80," - // "but received %d", - // dev_ctx_.GetComputeCapability())); - - float f_alpha = static_cast(alpha); - float f_beta = static_cast(beta); - - cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = dev_ctx_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); - - thrust::device_vector A_ptr(A, A + batchCount); - thrust::device_vector B_ptr(B, B + batchCount); - thrust::device_vector C_ptr(C, C + batchCount); - dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B_ptr.data().get(), - CUDA_R_16BF, - ldb, - A_ptr.data().get(), - CUDA_R_16BF, - lda, - &f_beta, - C_ptr.data().get(), - CUDA_R_16BF, - ldc, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); - }); -#else - // raise error - PADDLE_THROW(phi::errors::Unimplemented( - "cublasGemmBatchedEx with bfloat16 is not supported on cuda <= 11")); - -#endif // CUDA_VERSION >= 11000 -} -#endif - -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - cublasSideMode_t cuSide = - (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; - cublasFillMode_t cuUplo = - (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasDiagType_t cuDiag = - (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::TRSM( - handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); - }); -} - -template <> -template -void Blas::BatchedGETRF( - int n, T **a, int *ipiv, int *info, int batch_size) const { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedGETRI(int n, - const T **a, - const int *ipiv, - T **a_inv, - int *info, - int batch_size) const { - PADDLE_ENFORCE_NE( - a_inv, - a, - phi::errors::InvalidArgument( - "cuBLAS fuction 'cublasgetrfBatched' cannot be executed " - "in-place. The memory space of output matrix (address: %p) cannot " - "overlap memory space of input matrix (address: %p).", - a_inv, - a)); - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedMatInv( - int n, const T **a, T **a_inv, int *info, int batch_size) const { - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, - int n, - int nrhs, - const T **a, - int lda, - int *ipiv, - T **b, - int ldb, - int *info, - int batch_size) const { - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTrans = - (trans == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }); -} - -template <> -template -void Blas::BatchedTRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T **A, - int lda, - T **B, - int ldb, - int batch_size) const { - // solve row major `op ( A ) X = α B` by taking it as `X' op ( A' ) = α B'` - // where ' stands for transpose - cublasSideMode_t cuSide = - (side == CblasLeft) ? CUBLAS_SIDE_RIGHT : CUBLAS_SIDE_LEFT; - cublasFillMode_t cuUplo = - (uplo == CblasLower) ? CUBLAS_FILL_MODE_UPPER : CUBLAS_FILL_MODE_LOWER; - // use CUBLAS_OP_C (conjugate transpose) for complex - cublasOperation_t cuTransA = - (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasDiagType_t cuDiag = - (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - - dev_ctx_.CublasCall([&](cublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }); -} - -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.h deleted file mode 100644 index cb59d73bef8..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.h +++ /dev/null @@ -1,2003 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include -#include -#include -#include - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/complex.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define INT_MAX_VALUE 2147483647 - -namespace phi { -namespace funcs { - -namespace detail { -template -static void axpy( - int n, const T alpha, const T *x, const int incx, T *y, const int incy) { - // Y = Y + alpha * X - while (n-- > 0) { - *y += alpha * *x; - y = y + incy; - x = x + incx; - } -} -} // namespace detail - -template -struct CBlas; - -template <> -struct CBlas { - template - static void VCOPY(ARGS... args) { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas VCOPY do not supported on CPU, please check your code")); - } -}; - -template <> -struct CBlas { - template - static void VCOPY(ARGS... args) { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas VCOPY do not supported on CPU, please check your code")); - } -}; - -template <> -struct CBlas { - template - static void AXPY(ARGS... args) { - detail::axpy(args...); - } - - template - static void VCOPY(ARGS... 
args UNUSED) { - PADDLE_THROW(phi::errors::Unimplemented( - "Blas VCOPY do not supported on CPU with bfloat16," - " please check your code")); - } - - template - static void VADD(int n, - const phi::dtype::bfloat16 *x, - const phi::dtype::bfloat16 *y, - phi::dtype::bfloat16 *z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } - } - - template - static void VMUL(int n, - const phi::dtype::bfloat16 *x, - const phi::dtype::bfloat16 *y, - phi::dtype::bfloat16 *z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } - } - - template - static void VSUB(int n, - const phi::dtype::bfloat16 *x, - const phi::dtype::bfloat16 *y, - phi::dtype::bfloat16 *z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } - } -}; - -#ifdef PADDLE_WITH_MKLML -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - phi::dynload::cblas_sgemm(args...); - } - - template - static float *GEMM_ALLOC(ARGS... args) { - return phi::dynload::cblas_sgemm_alloc(args...); - } - - template - static void GEMM_PACK(ARGS... args) { - phi::dynload::cblas_sgemm_pack(args...); - } - - template - static void GEMM_COMPUTE(ARGS... args) { - phi::dynload::cblas_sgemm_compute(args...); - } - - template - static void GEMM_FREE(ARGS... args) { - phi::dynload::cblas_sgemm_free(args...); - } - -#ifdef PADDLE_WITH_LIBXSMM - template - static void SMM_GEMM(ARGS... args) { - libxsmm_sgemm(args...); - } -#endif - - template - static void AXPY(ARGS... args) { - phi::dynload::cblas_saxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - phi::dynload::cblas_scopy(args...); - } - - template - static void GEMV(ARGS... args) { - phi::dynload::cblas_sgemv(args...); - } - - template - static float DOT(ARGS... args) { - return phi::dynload::cblas_sdot(args...); - } - - template - static void SCAL(ARGS... args) { - phi::dynload::cblas_sscal(args...); - } - - template - static float ASUM(ARGS... args) { - return phi::dynload::cblas_sasum(args...); - } - - template - static void GEMM_BATCH(ARGS... args) { - phi::dynload::cblas_sgemm_batch(args...); - } - - template - static void VADD(ARGS... args) { - phi::dynload::vsAdd(args...); - } - - template - static void VSUB(ARGS... args) { - phi::dynload::vsSub(args...); - } - - template - static void VMUL(ARGS... args) { - phi::dynload::vsMul(args...); - } - - template - static void VDIV(ARGS... args) { - phi::dynload::vsDiv(args...); - } - - template - static void VEXP(ARGS... args) { - phi::dynload::vsExp(args...); - } - - template - static void VSQUARE(ARGS... args) { - phi::dynload::vsSqr(args...); - } - - template - static void VPOW(ARGS... args) { - phi::dynload::vsPowx(args...); - } - - template - static void VINV(ARGS... args) { - phi::dynload::vsInv(args...); - } - - template - static void VMERF(ARGS... args) { - phi::dynload::vmsErf(args...); - } -#if !defined(_WIN32) - template - static void CSRMM(ARGS... args) { - phi::dynload::mkl_scsrmm(args...); - } -#endif - - template - static void TRSM(ARGS... args) { - phi::dynload::cblas_strsm(args...); - } -}; - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - phi::dynload::cblas_dgemm(args...); - } - - template - static double *GEMM_ALLOC(ARGS... args) { - return phi::dynload::cblas_dgemm_alloc(args...); - } - - template - static void GEMM_PACK(ARGS... args) { - phi::dynload::cblas_dgemm_pack(args...); - } - - template - static void GEMM_COMPUTE(ARGS... args) { - phi::dynload::cblas_dgemm_compute(args...); - } - - template - static void GEMM_FREE(ARGS... 
args) { - phi::dynload::cblas_dgemm_free(args...); - } - -#ifdef PADDLE_WITH_LIBXSMM - template - static void SMM_GEMM(ARGS... args) { - libxsmm_dgemm(args...); - } -#endif - - template - static void AXPY(ARGS... args) { - phi::dynload::cblas_daxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - phi::dynload::cblas_dcopy(args...); - } - - template - static void GEMV(ARGS... args) { - phi::dynload::cblas_dgemv(args...); - } - - template - static double DOT(ARGS... args) { - return phi::dynload::cblas_ddot(args...); - } - - template - static void SCAL(ARGS... args) { - phi::dynload::cblas_dscal(args...); - } - - template - static double ASUM(ARGS... args) { - return phi::dynload::cblas_dasum(args...); - } - - template - static void GEMM_BATCH(ARGS... args) { - phi::dynload::cblas_dgemm_batch(args...); - } - - template - static void VADD(ARGS... args) { - phi::dynload::vdAdd(args...); - } - - template - static void VSUB(ARGS... args) { - phi::dynload::vdSub(args...); - } - - template - static void VMUL(ARGS... args) { - phi::dynload::vdMul(args...); - } - - template - static void VDIV(ARGS... args) { - phi::dynload::vdDiv(args...); - } - - template - static void VEXP(ARGS... args) { - phi::dynload::vdExp(args...); - } - - template - static void VSQUARE(ARGS... args) { - phi::dynload::vdSqr(args...); - } - - template - static void VPOW(ARGS... args) { - phi::dynload::vdPowx(args...); - } - - template - static void VINV(ARGS... args) { - phi::dynload::vdInv(args...); - } - - template - static void VMERF(ARGS... args) { - phi::dynload::vmdErf(args...); - } -#if !defined(_WIN32) - template - static void CSRMM(ARGS... args) { - phi::dynload::mkl_dcsrmm(args...); - } -#endif - - template - static void TRSM(ARGS... args) { - phi::dynload::cblas_dtrsm(args...); - } -}; - -template <> -struct CBlas> { - template - static void AXPY(int n, - const phi::dtype::complex alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - phi::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY); - } - - template - static void VCOPY(ARGS... args) { - phi::dynload::cblas_ccopy(args...); - } - - // the libmklml_intel.so paddle used has no vcAdd, vcSub, - // vcMul, vcDiv apis before rebuild from source - // so replace with the raw operator methods - /* - template - static void VADD(ARGS... args) { - phi::dynload::vcAdd(args...); - } - - template - static void VSUB(ARGS... args) { - phi::dynload::vcSub(args...); - } - - template - static void VMUL(ARGS... args) { - phi::dynload::vcMul(args...); - } - - template - static void VDIV(ARGS... 
args) { - phi::dynload::vcDiv(args...); - } - */ - - template - static void VADD(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] + b[i]; - } - } - - template - static void VSUB(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] - b[i]; - } - } - - template - static void VMUL(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] * b[i]; - } - } - template - static void VDIV(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] / b[i]; - } - } - - template - static void GEMV(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE trans, - int M, - int N, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *X, - int incx, - phi::dtype::complex beta, - phi::dtype::complex *Y, - int incy) { - const void *a_ = (const void *)(A); - const void *x_ = (const void *)(X); - void *y_ = static_cast(Y); - phi::dynload::cblas_cgemv( - layout, trans, M, N, &alpha, a_, lda, x_, incx, &beta, y_, incy); - } - - template - static void GEMM(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE trans_a, - CBLAS_TRANSPOSE trans_b, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - phi::dtype::complex beta, - phi::dtype::complex *C, - int ldc) { - const void *a_ = (const void *)(A); - const void *b_ = (const void *)(B); - void *c_ = static_cast(C); - phi::dynload::cblas_cgemm(layout, - trans_a, - trans_b, - M, - N, - K, - &alpha, - a_, - lda, - b_, - ldb, - &beta, - c_, - ldc); - } - - static void TRSM(CBLAS_LAYOUT layout, - CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE trans_a, - CBLAS_DIAG diag, - int M, - int N, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - const void *a_ = (const void *)(A); - void *b_ = static_cast(B); - phi::dynload::cblas_ctrsm( - layout, side, uplo, trans_a, diag, M, N, &alpha, a_, lda, b_, ldb); - } - - template - static void GEMM_BATCH(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE *trans_a, - CBLAS_TRANSPOSE *trans_b, - int *M, - int *N, - int *K, - phi::dtype::complex *alpha, - const phi::dtype::complex **A, - const int *lda, - const phi::dtype::complex **B, - const int *ldb, - phi::dtype::complex *beta, - phi::dtype::complex **C, - const int *ldc, - int group_count, - int *group_size) { - const void **A_void = (const void **)(&(*A)); - const void **B_void = (const void **)(&(*B)); - void **C_void = reinterpret_cast(C); - - phi::dynload::cblas_cgemm_batch(layout, - trans_a, - trans_b, - M, - N, - K, - alpha, - A_void, - lda, - B_void, - ldb, - beta, - C_void, - ldc, - group_count, - group_size); - } - - template - static void GEMM_EX(ARGS... args) { - phi::dynload::cblas_cgemm_batch(args...); - } -}; - -template <> -struct CBlas> { - template - static void AXPY(int n, - const phi::dtype::complex alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - phi::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY); - } - - template - static void VCOPY(ARGS... 
args) { - phi::dynload::cblas_zcopy(args...); - } - - // the libmklml_intel.so paddle used has no vzAdd, vzSub, - // vzMul, vzDiv apis before rebuild from source - // so replace with the raw operator methods - /* - template - static void VADD(ARGS... args) { - phi::dynload::vzAdd(args...); - } - - template - static void VSUB(ARGS... args) { - phi::dynload::vzSub(args...); - } - - template - static void VMUL(ARGS... args) { - phi::dynload::vzMul(args...); - } - - template - static void VDIV(ARGS... args) { - phi::dynload::vzDiv(args...); - } - */ - - template - static void VADD(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] + b[i]; - } - } - - template - static void VSUB(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] - b[i]; - } - } - - template - static void VMUL(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] * b[i]; - } - } - template - static void VDIV(int n, - const phi::dtype::complex *a, - const phi::dtype::complex *b, - phi::dtype::complex *y) { - for (int i = 0; i < n; ++i) { - y[i] = a[i] / b[i]; - } - } - - template - static void GEMV(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE trans, - int M, - int N, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *X, - int incx, - phi::dtype::complex beta, - phi::dtype::complex *Y, - int incy) { - const void *a_ = (const void *)(A); - const void *x_ = (const void *)(X); - void *y_ = static_cast(Y); - phi::dynload::cblas_zgemv( - layout, trans, M, N, &alpha, a_, lda, x_, incx, &beta, y_, incy); - } - - template - static void GEMM(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE trans_a, - CBLAS_TRANSPOSE trans_b, - int M, - int N, - int K, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - const phi::dtype::complex *B, - int ldb, - phi::dtype::complex beta, - phi::dtype::complex *C, - int ldc) { - const void *a_ = (const void *)(A); - const void *b_ = (const void *)(B); - void *c_ = static_cast(C); - phi::dynload::cblas_zgemm(layout, - trans_a, - trans_b, - M, - N, - K, - &alpha, - a_, - lda, - b_, - ldb, - &beta, - c_, - ldc); - } - - static void TRSM(CBLAS_LAYOUT layout, - CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE trans_a, - CBLAS_DIAG diag, - int M, - int N, - phi::dtype::complex alpha, - const phi::dtype::complex *A, - int lda, - phi::dtype::complex *B, - int ldb) { - const void *a_ = (const void *)(A); - void *b_ = static_cast(B); - phi::dynload::cblas_ztrsm( - layout, side, uplo, trans_a, diag, M, N, &alpha, a_, lda, b_, ldb); - } - - template - static void GEMM_BATCH(CBLAS_LAYOUT layout, - CBLAS_TRANSPOSE *trans_a, - CBLAS_TRANSPOSE *trans_b, - int *M, - int *N, - int *K, - phi::dtype::complex *alpha, - const phi::dtype::complex **A, - const int *lda, - const phi::dtype::complex **B, - const int *ldb, - phi::dtype::complex *beta, - phi::dtype::complex **C, - const int *ldc, - int group_count, - int *group_size) { - const void **A_void = (const void **)(&(*A)); - const void **B_void = (const void **)(&(*B)); - void **C_void = reinterpret_cast(C); - - phi::dynload::cblas_zgemm_batch(layout, - trans_a, - trans_b, - M, - N, - K, - alpha, - A_void, - lda, - B_void, - ldb, - beta, - C_void, - ldc, - group_count, - group_size); - } - - template - static void GEMM_EX(ARGS... 
args) { - phi::dynload::cblas_zgemm_batch(args...); - } -}; - -#else - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - cblas_sgemm(args...); - } - - template - static void AXPY(ARGS... args) { - cblas_saxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - cblas_scopy(args...); - } - - template - static void GEMV(ARGS... args) { - cblas_sgemv(args...); - } - - template - static void TRSM(ARGS... args) { - cblas_strsm(args...); - } -}; - -template <> -struct CBlas { - template - static void GEMM(ARGS... args) { - cblas_dgemm(args...); - } - - template - static void AXPY(ARGS... args) { - cblas_daxpy(args...); - } - - template - static void VCOPY(ARGS... args) { - cblas_dcopy(args...); - } - - template - static void GEMV(ARGS... args) { - cblas_dgemv(args...); - } - - template - static void TRSM(ARGS... args) { - cblas_dtrsm(args...); - } -}; - -template <> -struct CBlas> { - template - static void VCOPY(ARGS... args) { - cblas_ccopy(args...); - } - - template - static void AXPY(int n, - const phi::dtype::complex alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - cblas_caxpy(n, &alpha, X, incX, Y, incY); - } - - template - static void GEMV(const CBLAS_LAYOUT layout, - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - const phi::dtype::complex *X, - const int incX, - const phi::dtype::complex beta, - phi::dtype::complex *Y, - const int incY) { - cblas_cgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY); - } - - template - static void GEMM(const CBLAS_LAYOUT layout, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - const phi::dtype::complex *B, - const int ldb, - const phi::dtype::complex beta, - phi::dtype::complex *C, - const int ldc) { - cblas_cgemm( - layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); - } - - static void TRSM(const CBLAS_LAYOUT layout, - const CBLAS_SIDE side, - const CBLAS_UPLO uplo, - const CBLAS_TRANSPOSE transA, - const CBLAS_DIAG diag, - const int M, - const int N, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - phi::dtype::complex *B, - const int ldb) { - cblas_ctrsm(layout, side, uplo, transA, diag, M, N, &alpha, A, lda, B, ldb); - } -}; - -template <> -struct CBlas> { - template - static void VCOPY(ARGS... 
args) { - cblas_zcopy(args...); - } - - template - static void AXPY(int n, - const phi::dtype::complex alpha, - const phi::dtype::complex *X, - const int incX, - phi::dtype::complex *Y, - const int incY) { - cblas_zaxpy(n, &alpha, X, incX, Y, incY); - } - - template - static void GEMV(const CBLAS_LAYOUT layout, - const CBLAS_TRANSPOSE TransA, - const int M, - const int N, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - const phi::dtype::complex *X, - const int incX, - const phi::dtype::complex beta, - phi::dtype::complex *Y, - const int incY) { - cblas_zgemv(layout, TransA, M, N, &alpha, A, lda, X, incX, &beta, Y, incY); - } - - template - static void GEMM(const CBLAS_LAYOUT layout, - const CBLAS_TRANSPOSE TransA, - const CBLAS_TRANSPOSE TransB, - const int M, - const int N, - const int K, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - const phi::dtype::complex *B, - const int ldb, - const phi::dtype::complex beta, - phi::dtype::complex *C, - const int ldc) { - cblas_zgemm( - layout, TransA, TransB, M, N, K, &alpha, A, lda, B, ldb, &beta, C, ldc); - } - - static void TRSM(const CBLAS_LAYOUT layout, - const CBLAS_SIDE side, - const CBLAS_UPLO uplo, - const CBLAS_TRANSPOSE transA, - const CBLAS_DIAG diag, - const int M, - const int N, - const phi::dtype::complex alpha, - const phi::dtype::complex *A, - const int lda, - phi::dtype::complex *B, - const int ldb) { - cblas_ztrsm(layout, side, uplo, transA, diag, M, N, &alpha, A, lda, B, ldb); - } -}; - -#endif - -template <> -struct CBlas { - static void GEMM(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 GEMM not supported on CPU, please check your code")); - } - - static void SMM_GEMM(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 SMM_GEMM not supported on CPU, please check your code")); - } - static void VMUL(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 VMUL not supported on CPU, please check your code")); - } - static void VEXP(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 VEXP not supported on CPU, please check your code")); - } - static void VSQUARE(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 VSQUARE not supported on CPU, please check your code")); - } - static void VPOW(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 VPOW not supported on CPU, please check your code")); - } - static void DOT(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 DOT not supported on CPU, please check your code")); - }; - static void SCAL(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 SCAL not supported on CPU, please check your code")); - }; - static void ASUM(...) { - PADDLE_THROW(phi::errors::Unimplemented( - "float16 ASUM not supported on CPU, please check your code")); - }; -#ifdef PADDLE_WITH_MKLML - static void GEMM_BATCH(...) 
{ - PADDLE_THROW(phi::errors::Unimplemented( - "float16 GEMM_BATCH not supported on CPU, please check your code")); - } -#endif -}; - -#ifdef PADDLE_WITH_MKLML -template <> -template -T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, - const int M, - const int N, - const int K) const { - return CBlas::GEMM_ALLOC(id, M, N, K); -} - -template <> -template -void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T *src, - const int ld, - T *dst) const { - CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); -} - -template <> -template -void Blas::GEMM_COMPUTE(int transA, - int transB, - int M, - int N, - int K, - const T *A, - const int lda, - const T *B, - const int ldb, - T beta, - T *C, - const int ldc) const { - CBlas::GEMM_COMPUTE( - CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); -} - -template <> -template -void Blas::GEMM_FREE(T *data) const { - CBlas::GEMM_FREE(data); -} -#endif - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW( - common::errors::Unimplemented("GEMM not supported for large tensor " - "size on CPU, please check your code!")); - } - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - U alpha, - const T *A, - const T *B, - U beta, - T *C) const { - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW( - common::errors::Unimplemented("GEMM not supported for large tensor " - "size on CPU, please check your code!")); - } - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - static_cast(M), - static_cast(N), - static_cast(K), - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA == false ? CblasNoTrans : CblasTrans, - transB == false ? 
CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} - -template -template -void Blas::MatMul(const phi::DenseTensor &mat_a, - bool trans_a, - const phi::DenseTensor &mat_b, - bool trans_b, - T alpha, - phi::DenseTensor *mat_out, - T beta) const { - const auto &dim_a = mat_a.dims(); - const auto &dim_b = mat_b.dims(); - const auto &dim_out = mat_out->dims(); - PADDLE_ENFORCE_EQ( - dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - true, - phi::errors::InvalidArgument( - "The input and output of matmul should be matrix, the dim size must " - "be 2," - "but received dim size input_a:%d, input_b:%d, output:%d", - dim_a.size(), - dim_b.size(), - dim_out.size())); - PADDLE_ENFORCE_EQ( - mat_a.place() == mat_b.place() && mat_a.place() == mat_out->place(), - true, - phi::errors::InvalidArgument("The places of matrices in the matmul " - "should be same, please check your " - "code.")); - - int M = dim_out[0]; - int N = dim_out[1]; - int K = !trans_a ? dim_a[1] : dim_a[0]; - - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !trans_b ? CblasNoTrans : CblasTrans; - - this->GEMM(transA, - transB, - M, - N, - K, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->data()); -} - -template <> -template -void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CBlas::AXPY(n, alpha, x, 1, y, 1); -} - -template <> -template -void Blas::VCOPY(int n, const T *x, T *y) const { - CBlas::VCOPY(n, x, 1, y, 1); -} - -template <> -template -void Blas::VADD(int n, const T *x, const T *y, T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VADD(n, x, y, z); -#else - if (x == z) { - this->template AXPY(n, (T)(1.), y, z); - } else { - this->template VCOPY(n, y, z); - this->template AXPY(n, (T)(1.), x, z); - } -#endif -} - -template <> -template -void Blas::VSUB(int n, const T *x, const T *y, T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSUB(n, x, y, z); -#else - // try to find if openblas support vsub - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } -#endif -} - -template <> -template -void Blas::VMUL(int n, const T *x, const T *y, T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMUL(n, x, y, z); -#else - // try to find if openblas support vmul - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -#endif -} - -template <> -template -void Blas::VDIV(int n, const T *x, const T *y, T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VDIV(n, x, y, z); -#else - // try to find if openblas support vdiv - for (int i = 0; i < n; ++i) { - z[i] = x[i] / y[i]; - } -#endif -} - -template <> -template -void Blas::VEXP(int n, const T *x, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VEXP(n, x, y); -#else - // try to find if openblas support vexp - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -#endif -} - -template <> -template -void Blas::VSQUARE(int n, const T *x, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSQUARE(n, x, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } -#endif -} - -template <> -template -void Blas::VPOW(int n, const T *x, T a, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VPOW(n, x, a, y); -#else - for (int i = 0; i 
< n; ++i) { - y[i] = std::pow(x[i], a); - } -#endif -} - -template <> -template -T Blas::DOT(int n, const T *x, const T *y) const { -#ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, 1, y, 1); -#else - // try to find if openblas support cblas_dot - T sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] * y[i]; - } - return sum; -#endif -} - -template <> -template -void Blas::SCAL(int n, const T a, T *x) const { -#ifdef PADDLE_WITH_MKLML - CBlas::SCAL(n, a, x, 1); -#else - // try to find if openblas support cblas_scal - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -#endif -} - -template <> -template -T Blas::ASUM(int n, T *x, int inc) const { - auto sum = static_cast(0.0); -#ifdef PADDLE_WITH_MKLML - sum = CBlas::ASUM(n, x, inc); -#else - // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum - for (int c = 0; c < n; ++c) { - sum += x[c]; - } -#endif - return sum; -} - -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int64_t M, - int64_t N, - int64_t K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int64_t batchCount, - int64_t strideA, - int64_t strideB) const { - PADDLE_ENFORCE_NOT_NULL( - A, phi::errors::InvalidArgument("Pointer A should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - B, phi::errors::InvalidArgument("Pointer B should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - C, phi::errors::InvalidArgument("Pointer C should not be null.")); - - if (M > INT_MAX_VALUE || N > INT_MAX_VALUE || K > INT_MAX_VALUE) { - PADDLE_THROW( - common::errors::Unimplemented("CPU GEMM not supported for large tensor " - "size.")); - } - -#ifdef PADDLE_WITH_MKLML - if (batchCount > INT_MAX_VALUE) { - PADDLE_THROW(common::errors::Unimplemented( - "CPU GEMM not supported for large batch size in MKLML.")); - } - - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? N : K; - int ldc = N; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA]; - b_array[k] = &B[k * strideB]; - c_array[k] = &C[k * M * N]; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - reinterpret_cast(&M), - reinterpret_cast(&N), - reinterpret_cast(&K), - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - reinterpret_cast(&batchCount)); -#else - for (int k = 0; k < batchCount; ++k) { - auto *Ak = &A[k * strideA]; - auto *Bk = &B[k * strideB]; - auto *Ck = &C[k * M * N]; - this->template GEMM(transA, - transB, - reinterpret_cast(M), - reinterpret_cast(N), - reinterpret_cast(K), - alpha, - Ak, - Bk, - beta, - Ck); - } -#endif -} - -template <> -template -void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { -#ifdef PADDLE_WITH_MKLML - const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1); - const int ldb = (std::max)((transB == CblasNoTrans) ? 
N : K, 1); - const int ldc = (std::max)(N, 1); - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - A, - &lda, - B, - &ldb, - &beta, - C, - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -#endif -} - -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead -template <> -template -void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int W1, - int H1, - int W2, - int H2, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t head_number, - bool split_b_vertical) const { - int lda = (transA == CblasNoTrans) ? W1 : H1; - int ldb = (transB == CblasNoTrans) ? W2 : H2; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - - if (split_b_vertical) { - int ldc = W2; - int sub_width = W2 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W2 / head_number) - : i * (W2 / head_number) * H2; - int sub_matC_offset = i * W2 / head_number; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &sub_width, - &H2, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - - } else { - PADDLE_ENFORCE_EQ( - W1, - H2, - phi::errors::InvalidArgument( - "The fisrt matrix width should be same as second matrix height," - "but received fisrt matrix width %d" - ", second matrix height %d", - W1, - H2)); - int ldc = W2 * head_number; - int sub_width = W1 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? 
i * (W1 / head_number) * W2 - : i * (W1 / head_number); - int sub_matC_offset = i * W2; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &W2, - &sub_width, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - } -} -#endif // @} End Group Blas MKLML: BatchedGEMMWithHead - -template -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { - this->template GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} - -template <> -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - - // Since the matrix is very small, - // so the unit of calculation is already very fast, - // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, - // use xsmm directly. - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - const T alpha = static_cast(1); - const T beta = static_cast(0); - CBlas::SMM_GEMM( - &transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); - return; -#endif - - CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} - -template -template -void Blas::MatMul(const phi::DenseTensor &mat_a, - const MatDescriptor &dim_a, - const phi::DenseTensor &mat_b, - const MatDescriptor &dim_b, - T alpha, - phi::DenseTensor *mat_out, - T beta) const { - MatMul(mat_a.data(), - dim_a, - mat_b.data(), - dim_b, - alpha, - mat_out->data(), - beta); -} - -template -template -void Blas::MatMul(const T *mat_a, - const MatDescriptor &dim_a, - const T *mat_b, - const MatDescriptor &dim_b, - T alpha, - T *mat_out, - T beta) const { - PADDLE_ENFORCE_EQ( - dim_a.width_, - dim_b.height_, - phi::errors::InvalidArgument( - "The fisrt matrix width should be same as second matrix height," - "but received fisrt matrix width %d" - ", second matrix height %d", - dim_a.width_, - dim_b.height_)); - - CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; - if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { - this->template GEMM(transA, - transB, - dim_a.height_, - dim_b.width_, - dim_a.width_, - alpha, - mat_a, - mat_b, - beta, - mat_out); - } else { - PADDLE_ENFORCE_EQ( - dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || - dim_b.batch_size_ == 0, - true, - phi::errors::InvalidArgument( - "dim_a.batch_size should be equal to dim_b.batch_size, or " - "one of dim_a.batch_size and dim_b.batch_size should be 0. " - "But got dim_a.batch_size = %d, dim_b.batch_size = %d.", - dim_a.batch_size_, - dim_b.batch_size_)); - this->template BatchedGEMM( - transA, - transB, - dim_a.height_, - dim_b.width_, - dim_a.width_, - alpha, - mat_a, - mat_b, - beta, - mat_out, - dim_a.batch_size_ == 0 ? 
dim_b.batch_size_ : dim_a.batch_size_, - dim_a.stride_, - dim_b.stride_); - } -} - -#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ - !defined(PADDLE_WITH_HIP) -// @{ Group Blas MKLML: MatMulWithHead -/* - * Multiple two matrixes with multiple heads - * - * A new parameter, i.e head_number is added compared to normal MatMul. - * The head_number describes the number of heads a matrix is vertically - * split. - * - * When user calls this API, the multiplication of two big matrixes is split - * into multiplication of several (head_number_) small matrixes. e.g. if Mat A - * is [3, 24] and Mat B is [24, 4], when multiple A and B with head_number as - * 4, Mat A will be split as 4 matrix of [3, 6] and Mat B will be - * (horizontally) split as 4 matrix of [6, 4]. The result of final matrix - * will be 4 matrix of [3, 4], i.e. [3, 16]. - * Another example is A is [3, 8], B is [2, 16], head_number is 4. In this - * case, A will be split as [3, 2], B will be (vertically) split as - * [2, 4]. The final result will be 4 matrix of 4 matrix of [3,4], i.e. [3, 16] - */ -template -template -void Blas::MatMulWithHead(const phi::DenseTensor &mat_a, - const MatDescriptor &dim_a, - const phi::DenseTensor &mat_b, - const MatDescriptor &dim_b, - T alpha, - int head_number, - phi::DenseTensor *mat_out, - T beta, - bool mat_b_split_vertical) const { - PADDLE_ENFORCE_EQ( - dim_a.width_ % head_number, - 0, - phi::errors::InvalidArgument( - "The first input width must be some times the head number" - "but received first input width %d" - ", head_number %d", - dim_a.width_, - head_number)); - PADDLE_ENFORCE_GE( - head_number, - 1, - phi::errors::InvalidArgument("The head number should be greater equal 1," - "but received head number %d", - head_number)); - PADDLE_ENFORCE_LE( - head_number, - dim_a.width_, - phi::errors::InvalidArgument( - "The head number should be less equal first input width," - "but received first input width %d" - ", head_number %d", - dim_a.width_, - head_number)); - CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; - CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; - - if (mat_b_split_vertical) { - PADDLE_ENFORCE_EQ( - dim_b.height_, - dim_a.width_ / head_number, - phi::errors::InvalidArgument( - "The second input height should be equal than first input width," - "but received second input height %d, first input width %d", - dim_b.height_, - dim_a.width_ / head_number)); - PADDLE_ENFORCE_EQ( - dim_a.width_ % head_number, - 0, - phi::errors::InvalidArgument( - "The second input width should be some times the head number" - "but received second input width %d" - ", head_number %d", - dim_b.width_, - head_number)); - } - - if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { - int lda = !dim_a.trans_ ? dim_a.width_ : dim_a.height_; - int ldb = !dim_b.trans_ ? dim_b.width_ : dim_b.height_; - int sub_matA_offset; - int sub_matB_offset; - int sub_matC_offset; - int sub_mat_M = dim_a.height_; - int sub_mat_N; - int sub_mat_K; - int ldc; - - for (int i = 0; i < head_number; i++) { - sub_matA_offset = dim_a.trans_ - ? i * (dim_a.width_ / head_number) * dim_a.height_ - : i * (dim_a.width_ / head_number); - if (mat_b_split_vertical) { - sub_matB_offset = dim_b.trans_ - ? 
i * (dim_b.width_ / head_number) * dim_b.height_ - : i * (dim_b.width_ / head_number); - sub_matC_offset = i * dim_b.width_ / head_number; - - sub_mat_N = dim_b.width_ / head_number; - sub_mat_K = dim_b.height_; - - ldc = dim_b.width_; - } else { - sub_matB_offset = - dim_b.trans_ ? i * (dim_b.height_ / head_number) - : i * (dim_b.height_ / head_number) * dim_b.width_; - sub_matC_offset = i * dim_b.width_; - - sub_mat_N = dim_b.width_; - sub_mat_K = dim_a.width_ / head_number; - - ldc = head_number * dim_b.width_; - } - - this->template GEMM(transA, - transB, - sub_mat_M, - sub_mat_N, - sub_mat_K, - alpha, - mat_a.data() + sub_matA_offset, - lda, - mat_b.data() + sub_matB_offset, - ldb, - beta, - mat_out->data() + sub_matC_offset, - ldc); - } - } else { - PADDLE_ENFORCE_EQ( - (dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || - dim_b.batch_size_ == 0), - true, - phi::errors::InvalidArgument( - "The first input batch size should be equal than second input," - "either two input batch size is 0, but received first input batch " - "size" - " %d, second input batch size %d", - dim_a.batch_size_, - dim_b.batch_size_)); - - this->template BatchedGEMMWithHead( - transA, - transB, - dim_a.width_, - dim_a.height_, - dim_b.width_, - dim_b.height_, - alpha, - mat_a.data(), - mat_b.data(), - beta, - mat_out->data(), - dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_, - dim_a.stride_, - dim_b.stride_, - head_number, - mat_b_split_vertical); - } -} -#endif // @} End Group Blas MKLML: MatMulWithHead - -template -template -void Blas::VINV(int n, const T *a, T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VINV(n, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = 1.0 / a[i]; - } -#endif -} - -template <> -template -void Blas::VMERF(int n, const T *a, T *y, int64_t mode) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMERF(n, a, y, mode); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::erf(a[i]); - } -#endif -} - -#ifdef PADDLE_WITH_MKLML -template <> -template -void Blas::CSRMM(const char *transa, - const int *m, - const int *n, - const int *k, - const T *alpha, - const char *matdescra, - const T *val, - const int *indx, - const int *pntrb, - const int *pntre, - const T *b, - const int *ldb, - const T *beta, - T *c, - const int *ldc) const { - CBlas::CSRMM(transa, - m, - n, - k, - alpha, - matdescra, - val, - indx, - pntrb, - pntre, - b, - ldb, - beta, - c, - ldc); -} -#endif - -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - CBlas::TRSM( - CblasRowMajor, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); -} - -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/blaslt_gemm_search.h b/backends/metax_gpu/kernels/funcs/blas/blaslt_gemm_search.h deleted file mode 100644 index 6dcc56f8569..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blaslt_gemm_search.h +++ /dev/null @@ -1,794 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include -#include -#include -#include - -#include "paddle/common/flags.h" -#include "paddle/phi/api/include/context_pool.h" -#include "paddle/phi/backends/dynload/cublasLt.h" -#include "paddle/phi/backends/gpu/gpu_info.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/common/place.h" -#include "paddle/phi/core/allocator.h" -#include "paddle/phi/core/dense_tensor.h" - -COMMON_DECLARE_string(cublaslt_device_best_config); - -namespace phi { -namespace funcs { -namespace cublaslt_internal { - -const std::array split_k_candidates = {2, 3, 4, 5, 6, 8, 12, 16, 32}; - -struct CublasLtAlgoConfig { - int m; - int n; - int k; - int algo_id; - int swizzle; - int custom_option; - int tile; - int split_k_val; - int reduction_scheme; - int stages; -}; - -struct CublasLtAlgoSelectorParam { - float time{0.0}; - cublasLtMatmulAlgo_t algo; - CublasLtAlgoConfig algo_config; -}; - -inline bool compare_algo_time(const CublasLtAlgoSelectorParam& param_a, - const CublasLtAlgoSelectorParam& param_b) { - return (param_a.time < param_b.time); -} - -class CublasLtAlgoCache { - public: - static CublasLtAlgoCache& Instance() { - static CublasLtAlgoCache instance(100 /*search_times*/); - return instance; - } - - template - void RunAndMeasureAlgo(cublasLtHandle_t handle, - cublasLtMatmulDesc_t matmul_desc, - cublasLtMatrixLayout_t a_desc, - cublasLtMatrixLayout_t b_desc, - cublasLtMatrixLayout_t bias_desc, - cublasLtMatrixLayout_t c_desc, - void* alpha, - void* beta, - const InT* a, - const InT* b, - const OutT* bias, - OutT* c, - CublasLtAlgoSelectorParam& param, // NOLINT - cudaEvent_t& start_event, // NOLINT - cudaEvent_t& stop_event, // NOLINT - cudaStream_t stream) { - cublasStatus_t status; - cublasLtMatmulHeuristicResult_t heuristic_result; - status = dynload::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - ¶m.algo, - &heuristic_result); - PADDLE_ENFORCE_GPU_SUCCESS(status); - if (status != CUBLAS_STATUS_SUCCESS) { - param.time = std::numeric_limits::max(); - return; - } - size_t workspace_size = heuristic_result.workspaceSize; - auto workspace = phi::memory_utils::Alloc( - phi::GPUPlace(phi::backends::gpu::GetCurrentDeviceId()), - workspace_size, - phi::Stream(reinterpret_cast(stream))); - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(start_event, stream)); - int repeats = search_times_; - - for (int loop = 0; loop < repeats; loop++) { - status = dynload::cublasLtMatmul(handle, - matmul_desc, - alpha, - a, - a_desc, - b, - b_desc, - beta, - bias, - bias_desc, - c, - c_desc, - ¶m.algo, - workspace->ptr(), - workspace_size, - stream); - if (status != CUBLAS_STATUS_SUCCESS) { - param.time = std::numeric_limits::max(); - return; - } - } - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(stop_event, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); - - float time; - PADDLE_ENFORCE_GPU_SUCCESS( - cudaEventElapsedTime(&time, start_event, stop_event)); - - param.time = time / repeats; - } - - template - cublasLtMatmulAlgo_t* CublasLtAlgoSelect(cublasLtHandle_t handle, - int 
m, - int n, - int k, - int batch_count, - const InT* a, - const InT* b, - const OutT* bias, - OutT* c, - void* alpha, - void* beta, - cublasLtMatmulDesc_t matmul_desc, - cublasLtMatrixLayout_t a_desc, - cublasLtMatrixLayout_t b_desc, - cublasLtMatrixLayout_t bias_desc, - cublasLtMatrixLayout_t c_desc, - cublasComputeType_t compute_type, - cudaDataType_t scale_type, - cudaDataType_t a_type, - cudaDataType_t b_type, - cudaDataType_t bias_type, - cudaDataType_t c_type, - cudaStream_t stream) { - // If we don't have config file and we do not search, here return nullptr - if (!has_config_file_ && search_times_ <= 0) { - return nullptr; - } - - // VLOG(0) << "m n k: " << m << " " << n << " " << k; - - int64_t seed = 0; - std::hash hash_fn; - - HashMatmulDesc(matmul_desc, &seed, hash_fn); - HashMatrixLayoutDesc(a_desc, &seed, hash_fn); - HashMatrixLayoutDesc(b_desc, &seed, hash_fn); - HashMatrixLayoutDesc(bias_desc, &seed, hash_fn); - HashMatrixLayoutDesc(c_desc, &seed, hash_fn); - - { - std::lock_guard lock(cache_mutex_); - if (algo_caches_.count(seed)) { - VLOG(3) << "CublasLtAlgoSelect Found in cache"; - return &algo_caches_[seed]; - } - } - - if (search_configs_.empty()) { - std::ifstream infile; - std::string config_file_path = FLAGS_cublaslt_device_best_config; - infile.open(config_file_path.c_str()); - if (infile.is_open()) { - size_t workspace_size; - float time; - char comma; - while (!infile.eof()) { - CublasLtAlgoConfig search_config; - infile >> search_config.m >> comma >> search_config.k >> comma >> - search_config.n >> comma >> search_config.algo_id >> comma >> - search_config.swizzle >> comma >> search_config.custom_option >> - comma >> search_config.tile >> comma >> - search_config.split_k_val >> comma >> - search_config.reduction_scheme >> comma >> search_config.stages >> - comma >> workspace_size >> comma >> time; - search_configs_.push_back(search_config); - } - infile.close(); - VLOG(3) << "Loaded " << search_configs_.size() << " configs"; - } - } - if (!search_configs_.empty()) { - auto configure_algo = [&](const CublasLtAlgoConfig& search_config) - -> cublasLtMatmulAlgo_t* { - cublasLtMatmulAlgo_t algo; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoInit(handle, - compute_type, - scale_type, - b_type, - a_type, - c_type, - c_type, - search_config.algo_id, - &algo)); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &search_config.custom_option, - sizeof(search_config.custom_option))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_TILE_ID, - &search_config.tile, - sizeof(search_config.tile))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &search_config.split_k_val, - sizeof(search_config.split_k_val))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, - &search_config.swizzle, - sizeof(search_config.swizzle))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &search_config.reduction_scheme, - sizeof(search_config.reduction_scheme))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_STAGES_ID, - &search_config.stages, - sizeof(search_config.stages))); - std::lock_guard lock(cache_mutex_); - algo_caches_[seed] = algo; - return 
&algo_caches_[seed]; - }; - const CublasLtAlgoConfig* pre = nullptr; - for (size_t i = 0; i < search_configs_.size(); i++) { - if (search_configs_[i].n == n && search_configs_[i].k == k && - m <= search_configs_[i].m) { - return configure_algo(search_configs_[i]); - } else if (search_configs_[i].n == n && search_configs_[i].k == k && - m > search_configs_[i].m) { - if (pre == nullptr || pre->m < search_configs_[i].m) - pre = &search_configs_[i]; - } - } - if (pre != nullptr) { - // use max m in file - return configure_algo(*pre); - } - } - - // if we have cache but not found algo, and we don't want to search, - // here return nullptr - if (search_times_ <= 0) { - return nullptr; - } - - VLOG(3) << "CublasLtAlgoSelect Not Found in cache"; - - // Get Ids - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoGetIds - cublasStatus_t status = CUBLAS_STATUS_SUCCESS; - int algo_ids[requested_algo_count_]; // NOLINT - - int num_algo_ids; - status = dynload::cublasLtMatmulAlgoGetIds(handle, - compute_type, - scale_type, - a_type, - b_type, - bias_type, - c_type, - requested_algo_count_, - algo_ids, - &num_algo_ids); - PADDLE_ENFORCE_GPU_SUCCESS(status); - - // Traverse all possible algo combinations - int step = 0; - int limit = 20000; - std::vector params; - - for (int idx = 0; idx < num_algo_ids; idx++) { - cublasLtMatmulAlgo_t algo; - - /* Initialize algo structure with given Algp ID */ - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoInit - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoInit(handle, - compute_type, - scale_type, - a_type, - b_type, - bias_type, - c_type, - algo_ids[idx], - &algo)); - - // Query the tiles enums supported by that algo which is used to alloc - // enough space to store it - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCapGetAttribute - size_t attr_size = 0; - - int batch_support; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT, - &batch_support, - sizeof(batch_support), - &attr_size)); - if (batch_count > 1 && batch_support == 0) { - continue; - } - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_TILE_IDS, nullptr, 0, &attr_size)); - - int num_tiles = static_cast(attr_size / sizeof(int)); - std::vector tiles(num_tiles == 0 ? 1 : num_tiles); - if (num_tiles == 0) { - tiles[0] = CUBLASLT_MATMUL_TILE_UNDEFINED; - num_tiles = 1; - } else { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_TILE_IDS, - tiles.data(), - sizeof(int) * num_tiles, - &attr_size)); - } - - // Query the stages enums supported by that algo (cuda must >= 11.0) - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, CUBLASLT_ALGO_CAP_STAGES_IDS, nullptr, 0, &attr_size)); - int num_stages = static_cast(attr_size / sizeof(int)); - std::vector stages(num_stages == 0 ? 
1 : num_stages); - if (num_stages == 0) { - stages[0] = CUBLASLT_MATMUL_STAGES_UNDEFINED; - num_stages = 1; - } else { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_STAGES_IDS, - stages.data(), - sizeof(int) * num_stages, - &attr_size)); - } - - // Retrieve Other Algo Capabilities attributes - int splitk_support, red_mask, swizzling_max, custom_option_max; - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_SPLITK_SUPPORT, - &splitk_support, - sizeof(splitk_support), - &attr_size)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK, - &red_mask, - sizeof(red_mask), - &attr_size)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT, - &swizzling_max, - sizeof(swizzling_max), - &attr_size)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulAlgoCapGetAttribute( - &algo, - CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX, - &custom_option_max, - sizeof(custom_option_max), - &attr_size)); - - /* Loop over the different tiles */ - for (int tile_id = 0; tile_id < num_tiles && step < limit; tile_id++) { - /* Loop over different stages count */ - for (int stage_id = 0; stage_id < num_stages && step < limit; - stage_id++) { - /* Loop over the different custom option if any */ - for (int custom_option = 0; - custom_option <= custom_option_max && step < limit; - custom_option++) { - /* Loop over the CTAs swizzling support */ - for (int k = 0; k <= swizzling_max && step < limit; k++) { - int splir_k_trial = 0; - if (splitk_support) { - splir_k_trial += - sizeof(split_k_candidates) / sizeof(split_k_candidates[0]); - } - - for (int l = 0; (l < (1 + splir_k_trial)) && (step < limit); - l++) { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_TILE_ID, - &tiles[tile_id], - sizeof(tiles[tile_id]))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_STAGES_ID, - &stages[stage_id], - sizeof(stages[stage_id]))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &custom_option, - sizeof(custom_option))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, - &k, - sizeof(k))); - int split_k_val = 1; - int reduction_scheme = CUBLASLT_REDUCTION_SCHEME_NONE; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &split_k_val, - sizeof(split_k_val))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &reduction_scheme, - sizeof(int))); - if (l > 0) { // Split-K case - split_k_val = split_k_candidates[l - 1]; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - &split_k_candidates[l - 1], - sizeof(split_k_candidates[l - 1]))); - for (reduction_scheme = 1; - reduction_scheme < - static_cast(CUBLASLT_REDUCTION_SCHEME_MASK) && - (step < limit); - reduction_scheme = reduction_scheme << 1) { - if (reduction_scheme & red_mask) { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoConfigSetAttribute( - &algo, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &reduction_scheme, - 
sizeof(reduction_scheme))); - - cublasLtMatmulHeuristicResult_t heurResult; - status = dynload::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - &algo, - &heurResult); - if (status == CUBLAS_STATUS_SUCCESS) { - CublasLtAlgoSelectorParam param; - param.algo = algo; - param.algo_config.m = m; - param.algo_config.n = n; - param.algo_config.k = k; - param.algo_config.algo_id = algo_ids[idx]; - param.algo_config.tile = tiles[tile_id]; - param.algo_config.swizzle = k; - param.algo_config.custom_option = custom_option; - param.algo_config.split_k_val = split_k_val; - param.algo_config.reduction_scheme = reduction_scheme; - param.algo_config.stages = stages[stage_id]; - params.emplace_back(param); - step++; - } - } // end if - } - } else { - // Prepare algos - cublasLtMatmulHeuristicResult_t heurResult; - // https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulAlgoCheck - status = dynload::cublasLtMatmulAlgoCheck(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - &algo, - &heurResult); - if (status == CUBLAS_STATUS_SUCCESS) { - CublasLtAlgoSelectorParam param; - param.algo = algo; - param.algo_config.m = m; - param.algo_config.n = n; - param.algo_config.k = k; - param.algo_config.algo_id = algo_ids[idx]; - param.algo_config.tile = tiles[tile_id]; - param.algo_config.swizzle = k; - param.algo_config.custom_option = custom_option; - param.algo_config.split_k_val = split_k_val; - param.algo_config.reduction_scheme = reduction_scheme; - param.algo_config.stages = stages[stage_id]; - params.emplace_back(param); - step++; - } - } - } - } - } - } - } - } - cudaEvent_t start_event; - cudaEvent_t stop_event; - - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&start_event)); - PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreate(&stop_event)); - - if (step == 0) { - VLOG(3) << "No algo can be used"; - return nullptr; - } - - VLOG(3) << "CublasLtAlgoSelect Start testRun " << step << " " - << params.size(); - - for (int i = 0; i < step; i++) { - RunAndMeasureAlgo(handle, - matmul_desc, - a_desc, - b_desc, - bias_desc, - c_desc, - alpha, - beta, - a, - b, - bias, - c, - params[i], - start_event, - stop_event, - stream); - } - std::sort(params.begin(), params.end(), compare_algo_time); - - size_t res_id = 0; - while (params[res_id].time == 0.0) { - res_id++; - if (res_id >= params.size()) break; - } - - if (res_id >= params.size()) { - VLOG(3) << "No algo can be used"; - return nullptr; - } - - VLOG(3) << "algo selected"; - - std::lock_guard lock(cache_mutex_); - algo_caches_[seed] = params[res_id].algo; - return &algo_caches_[seed]; - } - - ~CublasLtAlgoCache() { SerializeAlgoCachesToFile(); } - - private: - std::string algo_caches_file_{"./cublaslt_algo_caches_from_paddle"}; - std::unordered_map algo_caches_; - std::vector search_configs_; - int search_times_; - static constexpr int requested_algo_count_ = 100; - std::mutex cache_mutex_; - bool has_config_file_; - - explicit CublasLtAlgoCache(int search_times) - : search_times_(search_times), has_config_file_(true) { - // Init algo_caches_ from cache file - std::ifstream infile; - infile.open(algo_caches_file_); - if (!infile.is_open()) { - has_config_file_ = false; - VLOG(3) << "No CublasLtAlgoCache file found"; - return; - } - size_t cublaslt_version = 0, real_cublaslt_version = 0; - int64_t seed = 0; - std::array algo_data; - infile >> cublaslt_version; - VLOG(1) << "cublaslt_version " << cublaslt_version; - - if (dynload::cublasLtGetCudartVersion() != cublaslt_version) { - LOG(INFO) << 
algo_caches_file_ - << " is not compatible with current cublaslt_version " - << real_cublaslt_version; - return; - } - - while (!infile.eof()) { - infile >> seed >> algo_data[0] >> algo_data[1] >> algo_data[2] >> - algo_data[3] >> algo_data[4] >> algo_data[5] >> algo_data[6] >> - algo_data[7]; - - for (int i = 0; i < 8; ++i) { - algo_caches_[seed].data[i] = algo_data[i]; - } - } - infile.close(); - } - - // Serialize algo_caches_ to cache file - void SerializeAlgoCachesToFile() { - if (search_times_ > 0) { - int dev; - cudaGetDevice(&dev); - if (dev == 0) { - std::ofstream outfile; - outfile.open(algo_caches_file_, std::ios::out | std::ios::trunc); - outfile << dynload::cublasLtGetCudartVersion() << std::endl; - - for (const auto& [seed, algo] : algo_caches_) { - outfile << seed << " "; - for (size_t value : algo.data) { - outfile << value << " "; - } - outfile << std::endl; - } - outfile.close(); - } - } - } - - inline int64_t RoundToNextHighPowOfTwo(int64_t n, int64_t min_val) { - n--; - n |= (n >> 1); - n |= (n >> 2); - n |= (n >> 4); - n |= (n >> 8); - n |= (n >> 16); - return std::max(min_val, (n + 1)); - } - - void HashMatmulDesc(cublasLtMatmulDesc_t desc, - int64_t* seed, - const std::hash& hash_fn) { - size_t size_to_write; - int trans_a, trans_b; - uint32_t epilogue; - // int8_t fast_accum; - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_TRANSA, - &trans_a, - sizeof(trans_a), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(trans_a)); - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_TRANSB, - &trans_b, - sizeof(trans_b), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(trans_b)); - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescGetAttribute(desc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &epilogue, - sizeof(epilogue), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(epilogue)); - - // PADDLE_ENFORCE_GPU_SUCCESS( - // dyl::cublasLtMatmulDescGetAttribute(desc, - // CUBLASLT_MATMUL_DESC_FAST_ACCUM, - // &fast_accum, - // sizeof(fast_accum), - // &size_to_write)); - // HashValue(seed, hash_fn, static_cast(fast_accum)); - } - - void HashMatrixLayoutDesc(cublasLtMatrixLayout_t desc, - int64_t* seed, - const std::hash& hash_fn) { - size_t size_to_write; - uint32_t dtype; - int32_t batch; - uint64_t row, col; - int64_t ld, batch_offset; - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatrixLayoutGetAttribute(desc, - CUBLASLT_MATRIX_LAYOUT_TYPE, - &dtype, - sizeof(dtype), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(dtype)); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batch, - sizeof(batch), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(batch)); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(row, 32)); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, sizeof(col), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(col, 32)); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); - HashValue(seed, hash_fn, RoundToNextHighPowOfTwo(ld, 32)); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, 
CUBLASLT_MATRIX_LAYOUT_ROWS, &row, sizeof(row), - // &size_to_write)); - // HashValue(seed, hash_fn, row); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, CUBLASLT_MATRIX_LAYOUT_COLS, &col, sizeof(col), - // &size_to_write)); - // HashValue(seed, hash_fn, col); - - // PADDLE_ENFORCE_GPU_SUCCESS(dyl::cublasLtMatrixLayoutGetAttribute( - // desc, CUBLASLT_MATRIX_LAYOUT_LD, &ld, sizeof(ld), &size_to_write)); - // HashValue(seed, hash_fn, ld); - - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutGetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &batch_offset, - sizeof(batch_offset), - &size_to_write)); - HashValue(seed, hash_fn, static_cast(batch_offset)); - } - - void HashValue(int64_t* seed, - const std::hash& hash_fn, - int64_t value) { - *seed ^= hash_fn(value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2); - } -}; - -} // namespace cublaslt_internal -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/blaslt_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blaslt_impl.cu.h deleted file mode 100755 index d98182abef3..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/blaslt_impl.cu.h +++ /dev/null @@ -1,1137 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 - -#include // NOLINT - -#include "cuda.h" // NOLINT -#include "glog/logging.h" -// #include "paddle/phi/backends/dynload/cublasLt.h" -#include "paddle/phi/backends/gpu/cuda/cuda_helper.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/flags.h" -#include "paddle/phi/kernels/autotune/gpu_timer.h" -#include "paddle/phi/kernels/autotune/switch_autotune.h" - -PHI_DECLARE_int64(cublaslt_exhaustive_search_times); -#endif - -namespace phi { -namespace funcs { - -#if (defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0) - -// Set this enum according to -// https://docs.nvidia.com/cuda/cublas/index.html#cublasltepilogue-t -// While kMatmul, kMatmulGrad, kMatmulGradWithoutBias share the same -// enum value, but if all elements for MatmulPlanner->GetKey() is same, -// no matter forward or backward, they could share the same descriptor -// cache, in that the descriptor is for description of matmul operation. 
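The comment above, from the deleted blaslt_impl.cu.h, says matmul descriptors are cached under a planner key, so a forward and a backward matmul whose key fields all match reuse one cached descriptor. A minimal, self-contained sketch of that keying idea follows; it reuses the boost-style combine from the HashValue() helper deleted earlier in this patch, and PlannerKey-style names and the cache map are illustrative only, not the real Paddle API.

// Illustrative sketch only: a planner key built from shapes/flags indexing a
// descriptor cache, so matmuls with identical keys share one cached entry.
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <vector>

// Boost-style hash combine, mirroring the deleted HashValue() helper.
inline size_t Combine(size_t seed, int64_t value) {
  return seed ^ (std::hash<int64_t>{}(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}

inline size_t MakePlannerKey(const std::vector<int64_t>& x_dims,
                             const std::vector<int64_t>& y_dims,
                             bool trans_x, bool trans_y,
                             int dtype, int fused_type) {
  size_t seed = 0;
  for (int64_t d : x_dims) seed = Combine(seed, d);
  for (int64_t d : y_dims) seed = Combine(seed, d);
  seed = Combine(seed, trans_x);
  seed = Combine(seed, trans_y);
  seed = Combine(seed, dtype);
  return Combine(seed, fused_type);
}

// Key -> opaque descriptor handle; a repeated key hits the cache instead of
// rebuilding and re-searching the cuBLASLt descriptor.
std::unordered_map<size_t, void*> descriptor_cache;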
-enum MatmulFusedType { - kMatmul = 0, - kMatmulGrad = 1, - kMatmulGradWithoutBias = 2, - kMatmulBias = 3, - kMatmulRelu = 4, - kMatmulBiasRelu = 5, - kMatmulBiasGelu = 6, - kMatmulBiasReluWithReservedData = 7, - kMatmulBiasGeluWithReservedData = 8, - kMatmulReluGrad = 9, - kMatmulGeluGrad = 10, - kMatmulBiasGradToA = 11, - kMatmulBiasGradToB = 12 -}; - -static cublasLtEpilogue_t ConvertFusedType(MatmulFusedType fused_type) { - static std::map fused_type_map = { - {MatmulFusedType::kMatmul, CUBLASLT_EPILOGUE_DEFAULT}, - {MatmulFusedType::kMatmulGrad, CUBLASLT_EPILOGUE_DEFAULT}, - {MatmulFusedType::kMatmulGradWithoutBias, CUBLASLT_EPILOGUE_DEFAULT}, - {MatmulFusedType::kMatmulBias, CUBLASLT_EPILOGUE_BIAS}, - {MatmulFusedType::kMatmulRelu, CUBLASLT_EPILOGUE_RELU}, - {MatmulFusedType::kMatmulBiasRelu, CUBLASLT_EPILOGUE_RELU_BIAS}, - {MatmulFusedType::kMatmulBiasGelu, CUBLASLT_EPILOGUE_GELU_BIAS}, - {MatmulFusedType::kMatmulBiasReluWithReservedData, - CUBLASLT_EPILOGUE_RELU_AUX_BIAS}, - {MatmulFusedType::kMatmulBiasGeluWithReservedData, - CUBLASLT_EPILOGUE_GELU_AUX_BIAS}, - {MatmulFusedType::kMatmulReluGrad, CUBLASLT_EPILOGUE_DRELU}, - {MatmulFusedType::kMatmulGeluGrad, CUBLASLT_EPILOGUE_DGELU}, - {MatmulFusedType::kMatmulBiasGradToA, CUBLASLT_EPILOGUE_BGRADA}, - {MatmulFusedType::kMatmulBiasGradToB, CUBLASLT_EPILOGUE_BGRADB}}; - - return fused_type_map[fused_type]; -} - -enum FusedGEMMGradInType { kDX = 0, kDY = 1, kDZ = 2 }; - -template -struct FusedGEMMGradTrait; - -template <> -struct FusedGEMMGradTrait { - static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; - static constexpr auto kXGradB = FusedGEMMGradInType::kDY; - static constexpr auto kXGradATrans = false; - static constexpr auto kXGradBTrans = true; - - static constexpr auto kYGradA = FusedGEMMGradInType::kDX; - static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; - static constexpr auto kYGradATrans = true; - static constexpr auto kYGradBTrans = false; -}; - -template <> -struct FusedGEMMGradTrait { - static constexpr auto kXGradA = FusedGEMMGradInType::kDY; - static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; - static constexpr auto kXGradATrans = false; - static constexpr auto kXGradBTrans = true; - - static constexpr auto kYGradA = FusedGEMMGradInType::kDX; - static constexpr auto kYGradB = FusedGEMMGradInType::kDZ; - static constexpr auto kYGradATrans = false; - static constexpr auto kYGradBTrans = false; -}; - -template <> -struct FusedGEMMGradTrait { - static constexpr auto kXGradA = FusedGEMMGradInType::kDZ; - static constexpr auto kXGradB = FusedGEMMGradInType::kDY; - static constexpr auto kXGradATrans = false; - static constexpr auto kXGradBTrans = false; - - static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; - static constexpr auto kYGradB = FusedGEMMGradInType::kDX; - static constexpr auto kYGradATrans = true; - static constexpr auto kYGradBTrans = false; -}; - -template <> -struct FusedGEMMGradTrait { - static constexpr auto kXGradA = FusedGEMMGradInType::kDY; - static constexpr auto kXGradB = FusedGEMMGradInType::kDZ; - static constexpr auto kXGradATrans = true; - static constexpr auto kXGradBTrans = true; - - static constexpr auto kYGradA = FusedGEMMGradInType::kDZ; - static constexpr auto kYGradB = FusedGEMMGradInType::kDX; - static constexpr auto kYGradATrans = true; - static constexpr auto kYGradBTrans = true; -}; - -// To tell any matmul or fused matmul operation from each other. 
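For reference, a short usage sketch of the ConvertFusedType() mapping above, assuming cuBLASLt headers are available and op_desc is an already-created cublasLtMatmulDesc_t; it mirrors what SetFusedEpilogueOpDescriptor() does later in this deleted file, where the chosen epilogue is written into the operation descriptor.

// Usage sketch (assumes <cublasLt.h>; error handling omitted).
cublasLtEpilogue_t epilogue = ConvertFusedType(MatmulFusedType::kMatmulBiasRelu);
// kMatmulBiasRelu maps to CUBLASLT_EPILOGUE_RELU_BIAS, so the bias add and
// ReLU run inside the GEMM epilogue instead of as separate kernels.
cublasLtMatmulDescSetAttribute(op_desc,
                               CUBLASLT_MATMUL_DESC_EPILOGUE,
                               &epilogue,
                               sizeof(epilogue));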
-struct MatmulPlanner { - public: - const void* bias{nullptr}; - void* aux_data{nullptr}; - - MatmulPlanner() {} - MatmulPlanner(const std::vector& x_dims, - const std::vector& y_dims, - const bool trans_x, - const bool trans_y, - phi::DataType dtype, - MatmulFusedType fused_type, - const void* bias_data = nullptr, - void* reserve_data = nullptr, // Commonly for ReLu bit-mask. - bool use_addto = false, - bool no_exchange = true) - : bias(bias_data), aux_data(reserve_data), fused_type_(fused_type) { - use_addto_ = use_addto; - key_ = phi::autotune::GenKey(x_dims, - y_dims, - static_cast(trans_x), - static_cast(trans_y), - static_cast(dtype), - static_cast(fused_type_), - static_cast(use_addto_), - static_cast(no_exchange)); - } - - bool UseAddTo() const { return use_addto_; } - size_t GetKey() const { return key_; } - MatmulFusedType GetFusedType() const { return fused_type_; } - - size_t GenSubKey() const { return key_; } - - private: - MatmulFusedType fused_type_; - bool use_addto_; - size_t key_; -}; - -template -cublasComputeType_t GetCudaComputeType() { - if (std::is_same::value) { - return CUBLAS_COMPUTE_64F; - } else if (std::is_same::value) { - return CUBLAS_COMPUTE_32I; - } else { - return CUBLAS_COMPUTE_32F; - } -} - -struct MatmulDescriptor { - public: - cublasLtMatmulDesc_t op_desc{nullptr}; - cublasLtMatrixLayout_t x_desc{nullptr}; - cublasLtMatrixLayout_t y_desc{nullptr}; - cublasLtMatrixLayout_t out_desc{nullptr}; - cublasLtMatmulAlgo_t* algo{nullptr}; - bool is_cached{false}; - - MatmulDescriptor() {} - MatmulDescriptor(const MatmulDescriptor& obj) { - algo = obj.algo; - x_desc = obj.x_desc; - y_desc = obj.y_desc; - op_desc = obj.op_desc; - out_desc = obj.out_desc; - is_cached = obj.is_cached; - } - - MatmulDescriptor& operator=(const MatmulDescriptor& obj) { - algo = obj.algo; - x_desc = obj.x_desc; - y_desc = obj.y_desc; - op_desc = obj.op_desc; - out_desc = obj.out_desc; - is_cached = obj.is_cached; - - return *this; - } - - ~MatmulDescriptor() PADDLE_MAY_THROW { - if (!is_cached) { - PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatmulDescDestroy(op_desc)); - PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(y_desc)); - PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(x_desc)); - PADDLE_WARN_GPU_SUCCESS(dynload::cublasLtMatrixLayoutDestroy(out_desc)); - delete algo; - - op_desc = nullptr; - x_desc = nullptr; - y_desc = nullptr; - out_desc = nullptr; - algo = nullptr; - } - } - - // x_desc, y_desc, op_desc are allocated in heap memory. 
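A hypothetical caller sketch for the Create() method that follows, assuming its template parameter is the matmul element type T (as in upstream Paddle) and that planner is an already-constructed phi::funcs::MatmulPlanner: a batched FP16 matmul X[8, 64, 256] * Y[8, 256, 128] with no transposes.

// Hypothetical caller, for illustration only.
MatmulDescriptor desc;
desc.Create<phi::dtype::float16>(/*M=*/64, /*N=*/128, /*K=*/256,
                                 /*trans_x=*/false, /*trans_y=*/false,
                                 /*planner=*/&planner,
                                 /*batch_size=*/8,
                                 /*stride_x=*/64 * 256,
                                 /*stride_y=*/256 * 128,
                                 /*stride_out=*/64 * 128);
// With batch_size > 1, Create() also stamps the batch count and strides onto
// each cublasLtMatrixLayout_t via SetBatchAndStride().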
- template - void Create(const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - phi::funcs::MatmulPlanner* planner, - const int batch_size = 1, - const int64_t stride_x = 0, - const int64_t stride_y = 0, - const int64_t stride_out = 0, - bool grad_for_dx = true) { - using MT = typename phi::dtype::MPTypeTrait::Type; - cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); - cudaDataType_t out_mat_type = phi::backends::gpu::ToCudaDataType(); - cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); - cublasComputeType_t compute_type = GetCudaComputeType(); - - if (std::is_same::value) { - out_mat_type = phi::backends::gpu::ToCudaDataType(); - scale_type = phi::backends::gpu::ToCudaDataType(); - } - - // Create operation descriptor; see cublasLtMatmulDescAttributes_t for - // details about defaults; just need to set the transforms for A and B - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type)); - SetFusedEpilogueOpDescriptor(planner, trans_x, trans_y, N); - - // Create matrix descriptors - CreateMatrixLayout(&x_desc, mat_type, M, K, trans_x); - CreateMatrixLayout(&y_desc, mat_type, K, N, trans_y); - CreateMatrixLayout(&out_desc, out_mat_type, M, N, false); - - // Config batch size and stride. - if (batch_size > 1) { - SetBatchAndStride(x_desc, batch_size, stride_x); - SetBatchAndStride(y_desc, batch_size, stride_y); - SetBatchAndStride(out_desc, batch_size, stride_out); - } - } - - cublasLtMatmulAlgo_t* SetAlgo() { - // while entering this function, the desc shall be cached. - is_cached = true; - algo = new cublasLtMatmulAlgo_t; - return algo; - } - - template - void SetFusedEpiloguePtr(phi::funcs::MatmulPlanner* planner) { - if (planner->bias != nullptr) { - const T* bias_data = static_cast(planner->bias); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( - op_desc, - CUBLASLT_MATMUL_DESC_BIAS_POINTER, - &bias_data, - sizeof(bias_data))); - } - if (planner->aux_data != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( - op_desc, - CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, - &(planner->aux_data), - sizeof(planner->aux_data))); - } - } - - std::string GetDescResultString(std::string prefix, - bool has_algo = true) const { - std::ostringstream out; - out << prefix << " \n"; -#define GET_DESC_DATA_STRING(src) \ - do { \ - out << " " << #src << " = ["; \ - int num = sizeof((*src)) / sizeof(src->data[0]); \ - for (int i = 0; i < num; ++i) { \ - if (i == 0) { \ - out << src->data[i]; \ - } else { \ - out << ", " << src->data[i]; \ - } \ - } \ - out << "]\n"; \ - } while (0); - - if (has_algo) { - GET_DESC_DATA_STRING(algo); - } - GET_DESC_DATA_STRING(x_desc); - GET_DESC_DATA_STRING(y_desc); - GET_DESC_DATA_STRING(out_desc); - GET_DESC_DATA_STRING(op_desc); -#undef GET_DESC_DATA_STRING - return out.str(); - } - - void ExchangeXYDesc(bool no_exchange) {} - - protected: - void SetFusedEpilogueOpDescriptor(phi::funcs::MatmulPlanner* planner, - const bool trans_x, - const bool trans_y, - int64_t lead_dim) { - cublasOperation_t cublas_trans_x = trans_x ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t cublas_trans_y = trans_y ? 
CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescSetAttribute(op_desc, - CUBLASLT_MATMUL_DESC_TRANSB, - &cublas_trans_x, - sizeof(cublas_trans_x))); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescSetAttribute(op_desc, - CUBLASLT_MATMUL_DESC_TRANSA, - &cublas_trans_y, - sizeof(cublas_trans_y))); - MatmulFusedType fused_type = planner->GetFusedType(); - if (fused_type != MatmulFusedType::kMatmul) { - cublasLtEpilogue_t cublaslt_fused_type = ConvertFusedType(fused_type); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescSetAttribute(op_desc, - CUBLASLT_MATMUL_DESC_EPILOGUE, - &cublaslt_fused_type, - sizeof(fused_type))); - } - if (planner->aux_data) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulDescSetAttribute( - op_desc, - CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, - &lead_dim, - sizeof(lead_dim))); - } - } - - void CreateMatrixLayout(cublasLtMatrixLayout_t* desc, - cudaDataType type, - uint64_t rows, - uint64_t cols, - bool trans) { - if (trans) { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatrixLayoutCreate(desc, type, rows, cols, rows)); - } else { - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatrixLayoutCreate(desc, type, cols, rows, cols)); - } - } - - void SetBatchAndStride(cublasLtMatrixLayout_t desc, - int batch_size, - int64_t stride) { - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutSetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, - &batch_size, - sizeof(batch_size))); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatrixLayoutSetAttribute( - desc, - CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - &stride, - sizeof(stride))); - } -}; - -struct MatmulGradDescriptor : MatmulDescriptor { - public: - MatmulGradDescriptor() {} - - template - void Create(const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - phi::funcs::MatmulPlanner* planner, - const int batch_size = 1, - int64_t stride_x = 0, - int64_t stride_y = 0, - int64_t stride_out = 0, - bool grad_for_dx = true) { - using MT = typename phi::dtype::MPTypeTrait::Type; - cudaDataType_t mat_type = phi::backends::gpu::ToCudaDataType(); - cudaDataType_t scale_type = phi::backends::gpu::ToCudaDataType(); - cublasComputeType_t compute_type = GetCudaComputeType(); - - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulDescCreate(&op_desc, compute_type, scale_type)); - this->SetFusedEpilogueOpDescriptor( - planner, trans_x, trans_y, TransX ? 
M : K); - - // Create operation desciriptor; see cublasLtMatmulDescAttributes_t for - // details about defaults; just need to set the transforms for A and B - this->CreateMatrixLayout(&x_desc, mat_type, N, M, true); - if (grad_for_dx) { - this->CreateMatrixLayout(&y_desc, mat_type, K, N, TransY); - this->CreateMatrixLayout( - &out_desc, phi::backends::gpu::ToCudaDataType(), M, K, TransX); - } else { - this->CreateMatrixLayout(&y_desc, mat_type, M, K, TransX); - this->CreateMatrixLayout( - &out_desc, phi::backends::gpu::ToCudaDataType(), K, N, TransY); - } - } - - void ExchangeXYDesc(bool no_exchange) { - if (no_exchange) { - return; - } - auto* temp = y_desc; - y_desc = x_desc; - x_desc = temp; - } -}; - -template -struct CublasLtBase { - public: - using MT = typename phi::dtype::MPTypeTrait::Type; - static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, - size_t workspace_size) { - return phi::memory_utils::Alloc( - ctx.GetPlace(), - workspace_size, - phi::Stream(reinterpret_cast(ctx.stream()))); - } - - static void RunImpl(const phi::GPUContext& ctx, - MatmulDescT* desc, - const size_t sub_key, - const T* x_ptr, - const T* y_ptr, - OutT* out_ptr, - phi::funcs::MatmulPlanner* planner) { - MT alpha = static_cast(1); - MT beta = planner->UseAddTo() ? static_cast(1) : static_cast(0); - cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); - - // NOTE(limingshu): As workspace_size varies from different DL framework, - // I wonder is there any smarter idea for workspace setting, currently I - // just followed the settings from the NVIDIA colleague`s setting. - size_t workspace_size = static_cast(4) * 1024 * 1024; - phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size); - - if (planner != nullptr) { - if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && - (!desc->is_cached)) { - SearchBestAlgo(ctx, - cublaslt_handle, - desc, - static_cast(&alpha), - static_cast(&beta), - y_ptr, - x_ptr, - out_ptr, - workspace->ptr(), - workspace_size); - MatmulDescT* best_desc = new MatmulDescT(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); - - auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); - cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); - } - } - - VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmul(cublaslt_handle, - desc->op_desc, - static_cast(&alpha), - y_ptr, - desc->y_desc, - x_ptr, - desc->x_desc, - static_cast(&beta), - out_ptr, - desc->out_desc, - out_ptr, - desc->out_desc, - desc->algo, - workspace->ptr(), - workspace_size, - ctx.stream())); - } - - static void SearchBestAlgo(const phi::GPUContext& ctx, - const cublasLtHandle_t& lt_handle, - MatmulDescT* desc, - const void* alpha, - const void* beta, - const void* y_data, - const void* x_data, - void* out_data, - void* workspace_ptr, - size_t workspace_size) { - cublasLtMatmulPreference_t preference; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulPreferenceCreate(&preference)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( - preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &workspace_size, - sizeof(workspace_size))); - - int returned_results = 0; - constexpr int requested_algo_count = 10; - std::vector heuristic_results( - requested_algo_count); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, - desc->op_desc, - desc->y_desc, - desc->x_desc, - desc->out_desc, - 
desc->out_desc, - preference, - requested_algo_count, - heuristic_results.data(), - &returned_results)); - PADDLE_ENFORCE_GT(returned_results, - 0, - phi::errors::Unavailable("No GEMM algorithm avaliable.")); - int best_algo_idx = -1; - if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) { - best_algo_idx = 0; - } else { - float min_time_cost = std::numeric_limits::max(); - for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { - float cur_time_cost = - RunAndMeasureAlgo(ctx, - lt_handle, - desc, - alpha, - beta, - y_data, - x_data, - out_data, - workspace_ptr, - workspace_size, - &(heuristic_results[algo_idx].algo)); - VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx - << "] time: " << cur_time_cost << " s"; - - if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) || - (cur_time_cost < min_time_cost)) { - best_algo_idx = algo_idx; - min_time_cost = cur_time_cost; - } - } - } - VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx; - - cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); - *best_algo = heuristic_results[best_algo_idx].algo; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulPreferenceDestroy(preference)); - } - - static float RunAndMeasureAlgo(const phi::GPUContext& ctx, - const cublasLtHandle_t& lt_handle, - MatmulDescT* desc, - const void* alpha, - const void* beta, - const void* y_data, - const void* x_data, - void* out_data, - void* workspace_ptr, - size_t workspace_size, - cublasLtMatmulAlgo_t* algo) { - int repeats = FLAGS_cublaslt_exhaustive_search_times; - if (repeats <= 0) { - return std::numeric_limits::max(); - } - - phi::GpuTimer timer; - float time_cost = 0.f; - const auto& stream = ctx.stream(); - - for (int i = 0; i < repeats; ++i) { - timer.Start(stream); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle, - desc->op_desc, - alpha, - y_data, - desc->y_desc, - x_data, - desc->x_desc, - beta, - out_data, - desc->out_desc, - out_data, - desc->out_desc, - algo, - workspace_ptr, - workspace_size, - stream)); - timer.Stop(stream); - ctx.Wait(); - auto time = timer.ElapsedTime(); - if (i > 0) { - // Exclude the warmup runtime. - time_cost += time; - } - } - return (time_cost / (repeats - 1)); - } -}; - -template <> -struct CublasLtBase { - public: - static phi::Allocator::AllocationPtr GetWorkspace(const phi::GPUContext& ctx, - size_t workspace_size) { - return phi::memory_utils::Alloc( - ctx.GetPlace(), - workspace_size, - phi::Stream(reinterpret_cast(ctx.stream()))); - } - - static void RunImpl(const phi::GPUContext& ctx, - MatmulDescriptor* desc, - const size_t sub_key, - const int8_t* x_ptr, - const int8_t* y_ptr, - int32_t* out_ptr, - phi::funcs::MatmulPlanner* planner) { - int32_t alpha = 1; - int32_t beta = - planner->UseAddTo() ? 
static_cast(1) : static_cast(0); - cublasLtHandle_t cublaslt_handle = ctx.cublaslt_handle(); - - size_t workspace_size = static_cast(4) * 1024 * 1024; - phi::Allocator::AllocationPtr workspace = GetWorkspace(ctx, workspace_size); - - if (planner != nullptr) { - if (phi::autotune::AutoTuneStatus::Instance().UseAutoTune() && - (!desc->is_cached)) { - SearchBestAlgo(ctx, - cublaslt_handle, - desc, - static_cast(&alpha), - static_cast(&beta), - y_ptr, - x_ptr, - out_ptr, - workspace->ptr(), - workspace_size); - MatmulDescriptor* best_desc = new MatmulDescriptor(*desc); - VLOG(6) << best_desc->GetDescResultString( - "[Searched CublasltDescriptor] "); - - auto& cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); - cache.SetSubKey(sub_key, reinterpret_cast(best_desc)); - } - } - - VLOG(7) << desc->GetDescResultString("[Impl CublasltDescriptor] "); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmul(cublaslt_handle, - desc->op_desc, - static_cast(&alpha), - y_ptr, - desc->y_desc, - x_ptr, - desc->x_desc, - static_cast(&beta), - out_ptr, - desc->out_desc, - out_ptr, - desc->out_desc, - desc->algo, - workspace->ptr(), - workspace_size, - ctx.stream())); - } - - static void SearchBestAlgo(const phi::GPUContext& ctx, - const cublasLtHandle_t& lt_handle, - MatmulDescriptor* desc, - const void* alpha, - const void* beta, - const void* y_data, - const void* x_data, - void* out_data, - void* workspace_ptr, - size_t workspace_size) { - cublasLtMatmulPreference_t preference; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulPreferenceCreate(&preference)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmulPreferenceSetAttribute( - preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &workspace_size, - sizeof(workspace_size))); - - int returned_results = 0; - constexpr int requested_algo_count = 10; - std::vector heuristic_results( - requested_algo_count); - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulAlgoGetHeuristic(lt_handle, - desc->op_desc, - desc->y_desc, - desc->x_desc, - desc->out_desc, - desc->out_desc, - preference, - requested_algo_count, - heuristic_results.data(), - &returned_results)); - PADDLE_ENFORCE_GT(returned_results, - 0, - phi::errors::Unavailable("No GEMM algorithm avaliable.")); - int best_algo_idx = -1; - if (returned_results == 1 || FLAGS_cublaslt_exhaustive_search_times <= 0) { - best_algo_idx = 0; - } else { - float min_time_cost = std::numeric_limits::max(); - for (int algo_idx = 0; algo_idx < returned_results; ++algo_idx) { - float cur_time_cost = - RunAndMeasureAlgo(ctx, - lt_handle, - desc, - alpha, - beta, - y_data, - x_data, - out_data, - workspace_ptr, - workspace_size, - &(heuristic_results[algo_idx].algo)); - VLOG(6) << "[MatmulWithCublaslt] algo[" << algo_idx - << "] time: " << cur_time_cost << " s"; - - if ((best_algo_idx == 0 && (1.05 * cur_time_cost < min_time_cost)) || - (cur_time_cost < min_time_cost)) { - best_algo_idx = algo_idx; - min_time_cost = cur_time_cost; - } - } - } - VLOG(6) << "[MatmulWithCublaslt] best_algo_idx: " << best_algo_idx; - - cublasLtMatmulAlgo_t* best_algo = desc->SetAlgo(); - *best_algo = heuristic_results[best_algo_idx].algo; - PADDLE_ENFORCE_GPU_SUCCESS( - dynload::cublasLtMatmulPreferenceDestroy(preference)); - } - - static float RunAndMeasureAlgo(const phi::GPUContext& ctx, - const cublasLtHandle_t& lt_handle, - MatmulDescriptor* desc, - const void* alpha, - const void* beta, - const void* y_data, - const void* x_data, - void* out_data, - void* workspace_ptr, - size_t workspace_size, - 
cublasLtMatmulAlgo_t* algo) { - int repeats = FLAGS_cublaslt_exhaustive_search_times; - if (repeats <= 0) { - return std::numeric_limits::max(); - } - - phi::GpuTimer timer; - float time_cost = 0.f; - const auto& stream = ctx.stream(); - - for (int i = 0; i < repeats; ++i) { - timer.Start(stream); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cublasLtMatmul(lt_handle, - desc->op_desc, - alpha, - y_data, - desc->y_desc, - x_data, - desc->x_desc, - beta, - out_data, - desc->out_desc, - out_data, - desc->out_desc, - algo, - workspace_ptr, - workspace_size, - stream)); - timer.Stop(stream); - ctx.Wait(); - auto time = timer.ElapsedTime(); - if (i > 0) { - // Exclude the warmup runtime. - time_cost += time; - } - } - return (time_cost / (repeats - 1)); - } -}; - -// To judge if desc is cached or not. -template -struct DescriptorSetter { - public: - DescT desc; - size_t sub_key{std::numeric_limits::min()}; - - DescriptorSetter(phi::funcs::MatmulPlanner* planner, - const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - const int batch_size = 1, - int64_t stride_x = 0, - int64_t stride_y = 0, - int64_t stride_out = 0, - const bool no_exchange = true, - bool grad_for_dx = true) { - if (std::is_same::value) { - if (!trans_x && !trans_y) { - PADDLE_ENFORCE_EQ( - (N % 4 == 0 || N == 1), - true, - phi::errors::InvalidArgument( - "The dimension size N used in int8 matmul must be 1 or a " - "multiple of 4 does not " - "match the size (%d) currently contained in the container.", - N)); - PADDLE_ENFORCE_EQ( - (K % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 matmul must be a multiple " - "of 4 does not " - "match the size (%d) currently contained in the container.", - K)); - } else if (!trans_x && trans_y) { - PADDLE_ENFORCE_EQ( - (K % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 matmul must be a multiple " - "of 4 does not " - "match the size (%d) currently contained in the container.", - K)); - } else if (trans_x && !trans_y) { - PADDLE_ENFORCE_EQ( - (M % 4 == 0 || M == 1), - true, - phi::errors::InvalidArgument( - "The dimension size M used in int8 matmul must be 1 or a " - "multiple of 4 does not " - "match the size (%d) currently contained in the container.", - M)); - PADDLE_ENFORCE_EQ( - (N % 4 == 0 || N == 1), - true, - phi::errors::InvalidArgument( - "The dimension size N used in int8 matmul must be 1 or a " - "multiple of 4 does not " - "match the size (%d) currently contained in the container.", - N)); - } else { - PADDLE_ENFORCE_EQ( - (M % 4 == 0 || M == 1), - true, - phi::errors::InvalidArgument( - "The dimension size M used in int8 matmul must be 1 or a " - "multiple of 4 does not " - "match the size (%d) currently contained in the container.", - M)); - PADDLE_ENFORCE_EQ( - (K % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 matmul must be a multiple " - "of 4 does not " - "match the size (%d) currently contained in the container.", - K)); - } - } - - if (planner != nullptr) { - sub_key = planner->GenSubKey(); - } - - auto& mamtul_cache = phi::autotune::AutoTuneCache::Instance().GetMatmul(); - if (mamtul_cache.FindSubKey(sub_key)) { - desc = *(reinterpret_cast(mamtul_cache.GetSubKey(sub_key))); - desc.template SetFusedEpiloguePtr(planner); - VLOG(7) << desc.GetDescResultString("[Heap CublasltDescriptor] "); - } else { - desc.template Create(M, - N, - K, - trans_x, - trans_y, - planner, - batch_size, - stride_x, - stride_y, - stride_out, 
- grad_for_dx); - desc.ExchangeXYDesc(no_exchange); - if (planner != nullptr) { - desc.template SetFusedEpiloguePtr(planner); - } - VLOG(7) << desc.GetDescResultString("[Stack CublasltDescriptor] ", false); - } - } -}; - -// For matmul with kernels autotune -template -struct MatmulWithCublasLt : public CublasLtBase { - public: - static void Run(const phi::GPUContext& ctx, - const T* x_data, - const T* y_data, - OutT* out_data, - const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - phi::funcs::MatmulPlanner* planner = nullptr) { - auto setter = DescriptorSetter( - planner, M, N, K, trans_x, trans_y); - CublasLtBase::RunImpl( - ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); - } - - static void RunWithBatch(const phi::GPUContext& ctx, - const T* x_data, - const T* y_data, - OutT* out_data, - const int64_t M, - const int64_t N, - const int64_t K, - bool trans_x, - bool trans_y, - int batch_size, - int64_t stride_x, - int64_t stride_y, - int64_t stride_out, - phi::funcs::MatmulPlanner* planner = nullptr) { - auto setter = DescriptorSetter(planner, - M, - N, - K, - trans_x, - trans_y, - batch_size, - stride_x, - stride_y, - stride_out); - CublasLtBase::RunImpl( - ctx, &setter.desc, setter.sub_key, x_data, y_data, out_data, planner); - } - - static void RunWithBatch(const phi::GPUContext& ctx, - const T** x_data, - const T** y_data, - OutT** out_data, - const int64_t M, - const int64_t N, - const int64_t K, - bool trans_x, - bool trans_y, - int batch_size, - phi::funcs::MatmulPlanner* planner = nullptr) { - for (int i = 0; i < batch_size; ++i) { - Run(ctx, - x_data[i], - y_data[i], - out_data[i], - M, - N, - K, - trans_x, - trans_y, - planner); - } - } -}; - -// As for just Linear fused ephilogue below: out = matmul(x, y) + bias. -template -struct LinearWithCublasLt : public CublasLtBase { - static void Run(const phi::GPUContext& ctx, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* out, - const void* bias_data, - void* reserve_data, - const int64_t M, - const int64_t N, - const int64_t K, - const bool trans_x, - const bool trans_y, - const MatmulFusedType fused_type) { - auto planner = phi::funcs::MatmulPlanner(common::vectorize(x->dims()), - common::vectorize(y->dims()), - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - fused_type, - bias_data, - reserve_data); - auto setter = DescriptorSetter( - &planner, M, N, K, trans_x, trans_y); - CublasLtBase::RunImpl(ctx, - &setter.desc, - setter.sub_key, - x->data(), - y->data(), - out->data(), - &planner); - } -}; - -template -struct LinearGradWithCublasLt : public CublasLtBase { - static void Run( - const phi::GPUContext& ctx, - const phi::DenseTensor* x, - const phi::DenseTensor* y, - phi::DenseTensor* out, - const void* bias_data, - void* reserve_data, - const int64_t M, - const int64_t N, - const int64_t K, - const MatmulFusedType fused_type, - const bool trans_x, - const bool trans_y, - const bool use_addto, - const bool no_exchange, // exchange x_desc and y_desc for grad. 
- bool grad_for_dx = true) { - auto planner = phi::funcs::MatmulPlanner(common::vectorize(x->dims()), - common::vectorize(y->dims()), - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - fused_type, - bias_data, - reserve_data, - use_addto, - no_exchange); - auto setter = - DescriptorSetter( - &planner, - M, - N, - K, - trans_x, - trans_y, - /*batch_size=*/1, - /*stride_x=*/0, - /*stride_y=*/0, - /*stride_out=*/0, - /*exchange_x_y_desc=*/no_exchange, - /*grad_for_dx=*/grad_for_dx); - - // To setting data type for different kinda out_data. - if (grad_for_dx) { - CublasLtBase::RunImpl( - ctx, - &setter.desc, - setter.sub_key, - no_exchange ? x->data() : y->data(), - no_exchange ? y->data() : x->data(), - out->data(), - &planner); - } else { - CublasLtBase::RunImpl( - ctx, - &setter.desc, - setter.sub_key, - no_exchange ? x->data() : y->data(), - no_exchange ? y->data() : x->data(), - out->data(), - &planner); - } - } -}; -#else -// A void structure just for successfully compile. -struct MatmulPlanner {}; -#endif // (PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 - -} // namespace funcs -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublas.cc b/backends/metax_gpu/kernels/funcs/blas/cublas.cc deleted file mode 100644 index 77a0cced00b..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublas.cc +++ /dev/null @@ -1,40 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "cublas.h" // NOLINT - -namespace phi { -namespace dynload { -std::once_flag cublas_dso_flag; -void *cublas_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); - -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 -CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); -#endif - -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R3 -CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); -#endif - -#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 -CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); -#endif -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublas.h b/backends/metax_gpu/kernels/funcs/blas/cublas.h deleted file mode 100755 index 776c7a1723b..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublas.h +++ /dev/null @@ -1,148 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -// clang-format off -#pragma once -#include -#include - -#include // NOLINT -#include - -#include "kernels/dynload/dynamic_loader.h" -#include "./port.h" // NOLINT -// clang-format on -namespace phi { -namespace dynload { - -extern std::once_flag cublas_dso_flag; -extern void* cublas_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cublas routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using cublas_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(cublas_dso_flag, []() { \ - cublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ - }); \ - std::string replaced_name = #__name; \ - replaced_name = replaced_name.replace(0, 2, "mc"); \ - int index = replaced_name.find("_", 0); \ - if (index != -1) replaced_name = replaced_name.substr(0, index); \ - static void* p_##__name = \ - dlsym(cublas_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasSaxpy_v2); \ - __macro(cublasDaxpy_v2); \ - __macro(cublasCaxpy_v2); \ - __macro(cublasZaxpy_v2); \ - __macro(cublasSscal_v2); \ - __macro(cublasDscal_v2); \ - __macro(cublasScopy_v2); \ - __macro(cublasDcopy_v2); \ - __macro(cublasSgemv_v2); \ - __macro(cublasDgemv_v2); \ - __macro(cublasCgemv_v2); \ - __macro(cublasZgemv_v2); \ - __macro(cublasSgemm_v2); \ - __macro(cublasDgemm_v2); \ - __macro(cublasCgemm_v2); \ - __macro(cublasZgemm_v2); \ - __macro(cublasHgemm); \ - __macro(cublasSgemmEx); \ - __macro(cublasSgeam); \ - __macro(cublasDgeam); \ - __macro(cublasStrsm_v2); \ - __macro(cublasDtrsm_v2); \ - __macro(cublasCtrsm_v2); \ - __macro(cublasZtrsm_v2); \ - __macro(cublasCreate_v2); \ - __macro(cublasDestroy_v2); \ - __macro(cublasSetStream_v2); \ - __macro(cublasSetPointerMode_v2); \ - __macro(cublasGetPointerMode_v2); \ - __macro(cublasSgemmBatched); \ - __macro(cublasDgemmBatched); \ - __macro(cublasCgemmBatched); \ - __macro(cublasZgemmBatched); \ - __macro(cublasStrsmBatched); \ - __macro(cublasDtrsmBatched); \ - __macro(cublasCtrsmBatched); \ - __macro(cublasZtrsmBatched); \ - __macro(cublasSgetrfBatched); \ - __macro(cublasSgetriBatched); \ - __macro(cublasDgetrfBatched); \ - __macro(cublasDgetriBatched); \ - __macro(cublasSmatinvBatched); \ - __macro(cublasDmatinvBatched); \ - __macro(cublasSgetrsBatched); \ - __macro(cublasDgetrsBatched); \ - __macro(cublasCgetrfBatched); \ - __macro(cublasCgetriBatched); \ - __macro(cublasCmatinvBatched); \ - __macro(cublasZgetrfBatched); \ - __macro(cublasZgetriBatched); \ - __macro(cublasZmatinvBatched); - -CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) - -// APIs available after CUDA 8.0 -#if CUDA_VERSION >= 8000 -#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(cublasGemmEx); \ - __macro(cublasSgemmStridedBatched); \ - __macro(cublasDgemmStridedBatched); \ - __macro(cublasCgemmStridedBatched); \ - __macro(cublasZgemmStridedBatched); \ - __macro(cublasHgemmStridedBatched); - -CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) -#endif - -// APIs available after CUDA 9.0 -#if CUDA_VERSION >= 9000 -#define 
CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ - __macro(cublasSetMathMode); \ - __macro(cublasGetMathMode); - -CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) -#endif - -// APIs available after CUDA 9.1 -#if CUDA_VERSION >= 9010 -#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ - __macro(cublasGemmBatchedEx); \ - __macro(cublasGemmStridedBatchedEx); - -CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) -#endif - -#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublasLt.cc b/backends/metax_gpu/kernels/funcs/blas/cublasLt.cc deleted file mode 100644 index 776f7fdd812..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublasLt.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "cublasLt.h" - -namespace phi { -namespace dynload { -std::once_flag cublasLt_dso_flag; -void *cublasLt_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -CUBLASLT_BLAS_ROUTINE_EACH(DEFINE_WRAP); - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublasLt.h b/backends/metax_gpu/kernels/funcs/blas/cublasLt.h deleted file mode 100644 index 2f8a929dd0c..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublasLt.h +++ /dev/null @@ -1,115 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include - -#include // NOLINT -#include - -#include "./port.h" -#include "kernels/dynload/dynamic_loader.h" - -namespace phi { -namespace dynload { - -extern std::once_flag cublasLt_dso_flag; -extern void* cublasLt_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cublasLt routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#define DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using cublasLt_func = \ - decltype(::__name(std::declval()...)) (*)(Args...); \ - std::call_once(cublasLt_dso_flag, []() { \ - cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \ - }); \ - std::string replaced_name = #__name; \ - replaced_name = replaced_name.replace(0, 2, "mc"); \ - static void* p_##__name = \ - dlsym(cublasLt_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -// APIs available after CUDA 11.1 -#if CUDA_VERSION >= 11010 -#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasLtCreate); \ - __macro(cublasLtDestroy); \ - __macro(cublasLtMatmul); \ - __macro(cublasLtMatmulDescCreate); \ - __macro(cublasLtMatmulDescDestroy); \ - __macro(cublasLtMatmulDescSetAttribute); \ - __macro(cublasLtMatmulDescGetAttribute); \ - __macro(cublasLtMatrixLayoutCreate); \ - __macro(cublasLtMatrixLayoutDestroy); \ - __macro(cublasLtMatrixLayoutSetAttribute); \ - __macro(cublasLtMatrixLayoutGetAttribute); \ - __macro(cublasLtMatmulPreferenceCreate); \ - __macro(cublasLtMatmulPreferenceDestroy); \ - __macro(cublasLtMatmulPreferenceSetAttribute); \ - __macro(cublasLtMatmulAlgoGetHeuristic); \ - __macro(cublasLtMatrixTransform); \ - __macro(cublasLtMatrixTransformDescCreate); \ - __macro(cublasLtMatrixTransformDescDestroy); \ - __macro(cublasLtMatrixTransformDescSetAttribute); \ - __macro(cublasLtMatmulAlgoInit); \ - __macro(cublasLtMatmulAlgoConfigSetAttribute); \ - __macro(cublasLtMatmulAlgoConfigGetAttribute); \ - __macro(cublasLtMatmulAlgoGetIds); \ - __macro(cublasLtMatmulAlgoCapGetAttribute); \ - __macro(cublasLtMatmulAlgoCheck); -// __macro(cublasLtGetCudartVersion); -#else -#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasLtCreate); \ - __macro(cublasLtDestroy); \ - __macro(cublasLtMatmul); \ - __macro(cublasLtMatmulDescCreate); \ - __macro(cublasLtMatmulDescDestroy); \ - __macro(cublasLtMatmulDescSetAttribute); \ - __macro(cublasLtMatmulDescGetAttribute); \ - __macro(cublasLtMatrixLayoutCreate); \ - __macro(cublasLtMatrixLayoutDestroy); \ - __macro(cublasLtMatrixLayoutSetAttribute); \ - __macro(cublasLtMatrixLayoutGetAttribute); \ - __macro(cublasLtMatmulPreferenceCreate); \ - __macro(cublasLtMatmulPreferenceDestroy); \ - __macro(cublasLtMatmulPreferenceSetAttribute); \ - __macro(cublasLtMatmulAlgoGetHeuristic); \ - __macro(cublasLtMatrixTransform); \ - __macro(cublasLtMatrixTransformDescCreate); \ - __macro(cublasLtMatrixTransformDescDestroy); \ - __macro(cublasLtMatrixTransformDescSetAttribute); -#endif - -CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) -// #endif - -#undef DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/cublaslt.h b/backends/metax_gpu/kernels/funcs/blas/cublaslt.h deleted file mode 100755 index 24505567baf..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/cublaslt.h +++ /dev/null @@ -1,328 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "./cublasLt.h" -#include "paddle/phi/common/float8_e4m3fn.h" -#include "paddle/phi/core/dense_tensor.h" - -namespace dyl = phi::dynload; - -namespace phi { - -struct CublasLtAlgoParam { - int algoId; - int swizzle; - int customOption; - int tile; - int splitK_val; - int reductionScheme; - int stages; - size_t workspace_size; -}; - -const std::map, CublasLtAlgoParam> AlgoParamCache{}; - -class CublasLtHelper { - public: - CublasLtHelper(int m, int k, int n, cublasLtHandle_t handle) - : handle_(handle), alpha_(1), beta_(0), m_(m), k_(k), n_(n) { - cublasStatus_t status; - - cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I; - - // matmul desc - status = dyl::cublasLtMatmulDescCreate( - &matmul_desc_, cudaComputeType, CUDA_R_32I); - - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatmulDescCreate execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - cublasOperation_t op_transpose = CUBLAS_OP_T; - status = dyl::cublasLtMatmulDescSetAttribute(matmul_desc_, - CUBLASLT_MATMUL_DESC_TRANSA, - &op_transpose, - sizeof(op_transpose)); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatmulDescSetAttribute execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - - // matrix desc - status = dyl::cublasLtMatrixLayoutCreate(&B_desc_, CUDA_R_8I, k, n, k); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatrixLayoutCreate execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - - status = dyl::cublasLtMatrixLayoutCreate(&A_desc_, CUDA_R_8I, k, m, k); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatrixLayoutCreate execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - - status = dyl::cublasLtMatrixLayoutCreate(&C_desc_, CUDA_R_32I, n, m, n); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatrixLayoutCreate execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - -#if CUDA_VERSION >= 11020 - - int algoId = 21; - int swizzle = 0; - int customOption = 0; - int tile = 15; - int splitK_val = 0; - int reductionScheme = 0; - int stages = 23; - workspace_size_ = 0; - if (m >= 128) { - tile = 20; - stages = 17; - } - - std::tuple key(m_, k_, n_); - if (AlgoParamCache.count(key) != 0) { - auto value = AlgoParamCache.at(key); - algoId = value.algoId; - swizzle = value.swizzle; - customOption = value.customOption; - tile = value.tile; - splitK_val = value.splitK_val; - reductionScheme = value.reductionScheme; - stages = value.stages; - workspace_size_ = value.workspace_size; - } - - dyl::cublasLtMatmulAlgoInit(handle_, - cudaComputeType, - CUDA_R_32I, - CUDA_R_8I, - CUDA_R_8I, - CUDA_R_32I, - CUDA_R_32I, - algoId, - &algo_); - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, - CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, - &(customOption), - sizeof(customOption)); - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile)); - dyl::cublasLtMatmulAlgoConfigSetAttribute(&algo_, - CUBLASLT_ALGO_CONFIG_SPLITK_NUM, - 
&(splitK_val), - sizeof(splitK_val)); - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, - CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, - &(swizzle), - sizeof(swizzle)); - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, - CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, - &(reductionScheme), - sizeof(int)); -#if CUDA_VERSION >= 11000 - dyl::cublasLtMatmulAlgoConfigSetAttribute( - &algo_, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages)); -#endif -#endif - } - ~CublasLtHelper() {} - - void GEMM(const int8_t* A_dev, - const int8_t* B_dev, - int32_t* C_dev, - cudaStream_t stream, - void* workspace = nullptr) { - cublasStatus_t status; - - status = dyl::cublasLtMatmul(handle_, - matmul_desc_, - &alpha_, - B_dev, - B_desc_, - A_dev, - A_desc_, - &beta_, - C_dev, - C_desc_, - C_dev, - C_desc_, -#if CUDA_VERSION >= 11020 - &algo_, - workspace, - workspace_size_, -#else - nullptr, - nullptr, - 0, -#endif - stream); - PADDLE_ENFORCE_EQ( - status, - CUBLAS_STATUS_SUCCESS, - common::errors::External( - "cublasLtMatmul execution error" - "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " - "information")); - } - - private: - cublasLtHandle_t handle_; - cublasLtMatmulDesc_t matmul_desc_; - cublasLtMatrixLayout_t A_desc_; - cublasLtMatrixLayout_t B_desc_; - cublasLtMatrixLayout_t C_desc_; - - cublasLtMatmulAlgo_t algo_; - - int32_t alpha_ = 1; - int32_t beta_ = 0; - - int m_ = 0; - int k_ = 0; - int n_ = 0; - - size_t workspace_size_ = 0; -}; - -template -inline cudaDataType_t GetCublasLtDataType() { - return CUDA_R_32F; -} - -template <> -inline cudaDataType_t GetCublasLtDataType() { - return CUDA_R_16F; -} - -template <> -inline cudaDataType_t GetCublasLtDataType() { - return CUDA_R_16BF; -} - -#if CUDA_VERSION >= 12010 -template -void CublasLtMatmulFP8(const phi::GPUContext& dev_ctx, - const phi::DenseTensor& mat_a, - const phi::DenseTensor& mat_b, - phi::DenseTensor* workspace, - phi::DenseTensor* out) { - int m = mat_a.dims()[0]; - int k = mat_a.dims()[1]; - int n = mat_b.dims()[1]; - - // init data structure - cublasStatus_t status; - auto A_type = CUDA_R_8F_E4M3; - auto B_type = CUDA_R_8F_E4M3; - auto C_type = GetCublasLtDataType(); - - cublasLtMatmulDesc_t matmul_desc_; - cublasLtMatrixLayout_t A_desc_; - cublasLtMatrixLayout_t B_desc_; - cublasLtMatrixLayout_t C_desc_; - float alpha_ = 1.0f; - float beta_ = 0.0f; - - cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32F; - status = - dyl::cublasLtMatmulDescCreate(&matmul_desc_, cudaComputeType, CUDA_R_32F); - cublasOperation_t op_transpose = CUBLAS_OP_T; - status = dyl::cublasLtMatmulDescSetAttribute(matmul_desc_, - CUBLASLT_MATMUL_DESC_TRANSA, - &op_transpose, - sizeof(op_transpose)); - status = dyl::cublasLtMatrixLayoutCreate(&B_desc_, B_type, k, n, k); - status = dyl::cublasLtMatrixLayoutCreate(&A_desc_, A_type, k, m, k); - status = dyl::cublasLtMatrixLayoutCreate(&C_desc_, C_type, n, m, n); - - // Need to use heuristic - int returnedResults = 0; - cublasLtMatmulHeuristicResult_t heuristicResult = {}; - cublasLtMatmulPreference_t preference = NULL; - size_t work_space_size = workspace->numel(); - - status = dyl::cublasLtMatmulPreferenceCreate(&preference); - status = dyl::cublasLtMatmulPreferenceSetAttribute( - preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &work_space_size, - sizeof(work_space_size)); - - status = dyl::cublasLtMatmulAlgoGetHeuristic(dev_ctx.cublaslt_handle(), - matmul_desc_, - B_desc_, - A_desc_, - C_desc_, - C_desc_, - preference, - 1, - &heuristicResult, - &returnedResults); - - 
PADDLE_ENFORCE_NE(returnedResults, - 0, - common::errors::NotFound( - "Unable to find suitable cuBLAS GEMM algorithm")); - - status = - dyl::cublasLtMatmul(dev_ctx.cublaslt_handle(), - matmul_desc_, - &alpha_, - mat_b.data(), - B_desc_, - mat_a.data(), - A_desc_, - &beta_, - out->data(), - C_desc_, - out->data(), - C_desc_, - // nullptr, - &heuristicResult.algo, - // nullptr, - reinterpret_cast(workspace->data()), - // 0, - work_space_size, - dev_ctx.stream()); -} -#endif - -} // namespace phi diff --git a/backends/metax_gpu/kernels/funcs/blas/port.cc b/backends/metax_gpu/kernels/funcs/blas/port.cc deleted file mode 100644 index bc6d54e5c5f..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/port.cc +++ /dev/null @@ -1,163 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// clang-format off -#include "port.h" // NOLINT - -#include -#include -#include -#include -#include "glog/logging.h" -#if !defined(_WIN32) -#include // dladdr -#include -#include - -#else -#include // std::accumulate in msvc -// clang-format on -void *dlsym(void *handle, const char *symbol_name) { - FARPROC found_symbol; - found_symbol = GetProcAddress((HMODULE)handle, symbol_name); - - if (found_symbol == NULL) { - LOG(ERROR) << "Load symbol " << symbol_name << " failed."; - throw std::runtime_error(std::string(symbol_name) + " not found."); - } - return reinterpret_cast(found_symbol); -} - -void *dlopen(const char *filename, int flag) { - std::string file_name(filename); - HMODULE hModule = LoadLibrary(file_name.c_str()); - if (!hModule) { - if (flag) { - throw std::runtime_error(file_name + " not found."); - } else { - return nullptr; - } - } - return reinterpret_cast(hModule); -} - -int gettimeofday(struct timeval *tp, void *tzp) { - time_t clock; - struct tm tm; - SYSTEMTIME wtm; - - GetLocalTime(&wtm); - tm.tm_year = wtm.wYear - 1900; - tm.tm_mon = wtm.wMonth - 1; - tm.tm_mday = wtm.wDay; - tm.tm_hour = wtm.wHour; - tm.tm_min = wtm.wMinute; - tm.tm_sec = wtm.wSecond; - tm.tm_isdst = -1; - clock = mktime(&tm); - tp->tv_sec = clock; - tp->tv_usec = wtm.wMilliseconds * 1000; - - return (0); -} -#endif // !_WIN32 - -void ExecShellCommand(const std::string &cmd, std::string *message) { - std::array buffer; -#if !defined(_WIN32) - std::shared_ptr pipe(popen(cmd.c_str(), "r"), pclose); -#else - std::shared_ptr pipe(_popen(cmd.c_str(), "r"), _pclose); -#endif // _WIN32 - if (!pipe) { - LOG(ERROR) << "error running command: " << cmd; - return; - } - while (!feof(pipe.get())) { - if (fgets(buffer.data(), 128, pipe.get()) != nullptr) { - *message += buffer.data(); - } - } -} - -bool PathExists(const std::string &path) { -#if !defined(_WIN32) - struct stat statbuf; - if (stat(path.c_str(), &statbuf) != -1) { - if (S_ISDIR(statbuf.st_mode)) { - return true; - } - } -#else - struct _stat statbuf; - if (_stat(path.c_str(), &statbuf) != -1) { - if 
(S_ISDIR(statbuf.st_mode)) { - return true; - } - } -#endif // !_WIN32 - return false; -} - -#if !defined(_WIN32) -constexpr char kSEP = '/'; -#else -constexpr char kSEP = '\\'; -#endif // _WIN32 - -bool FileExists(const std::string &filepath) { -#if !defined(_WIN32) - struct stat buffer; - return (stat(filepath.c_str(), &buffer) == 0); -#else - struct _stat buffer; - return (_stat(filepath.c_str(), &buffer) == 0); -#endif // !_WIN32 -} - -std::string DirName(const std::string &filepath) { - auto pos = filepath.rfind(kSEP); - if (pos == std::string::npos) { - return ""; - } - return filepath.substr(0, pos); -} - -void MkDir(const char *path) { - std::string path_error(path); - path_error += " mkdir failed!"; -#if !defined(_WIN32) - if (mkdir(path, 0755)) { - if (errno != EEXIST) { - throw std::runtime_error(path_error); - } - } -#else - BOOL return_value = CreateDirectory(path, NULL); - if (!return_value) { - auto errorno = GetLastError(); - if (errorno != ERROR_ALREADY_EXISTS) { - throw std::runtime_error(path_error); - } - } -#endif // !_WIN32 -} - -void MkDirRecursively(const char *fullpath) { - if (*fullpath == '\0') return; // empty string - if (FileExists(fullpath)) return; - - MkDirRecursively(DirName(fullpath).c_str()); - MkDir(fullpath); -} diff --git a/backends/metax_gpu/kernels/funcs/blas/port.h b/backends/metax_gpu/kernels/funcs/blas/port.h deleted file mode 100644 index d2a59199bb7..00000000000 --- a/backends/metax_gpu/kernels/funcs/blas/port.h +++ /dev/null @@ -1,61 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h - -#if !defined(_WIN32) -#include // dladdr -#include - -#else -#ifndef NOMINMAX -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#endif -// solve static linking error in windows -// https://github.com/google/glog/issues/301 -#define GOOGLE_GLOG_DLL_DECL -#include // _popen, _pclose -#include -#include -#include - -#ifndef S_ISDIR // windows port for sys/stat.h -#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) -#endif // S_ISDIR - -void *dlsym(void *handle, const char *symbol_name); - -void *dlopen(const char *filename, int flag); - -int gettimeofday(struct timeval *tp, void *tzp); -#endif // !_WIN32 - -void ExecShellCommand(const std::string &cmd, std::string *message); - -bool PathExists(const std::string &path); - -// TODO(yuyang18): If the functions below are needed by other files, move them -// to paddle::filesystem namespace. 
-bool FileExists(const std::string &filepath); - -std::string DirName(const std::string &filepath); - -void MkDir(const char *path); - -void MkDirRecursively(const char *fullpath); diff --git a/backends/metax_gpu/kernels/funcs/layer_norm_util.h b/backends/metax_gpu/kernels/funcs/layer_norm_util.h index 3e16e615b1d..0f8210d8b8f 100644 --- a/backends/metax_gpu/kernels/funcs/layer_norm_util.h +++ b/backends/metax_gpu/kernels/funcs/layer_norm_util.h @@ -18,7 +18,7 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/device_context.h" -#include "../funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" // clang-format on namespace phi { diff --git a/backends/metax_gpu/kernels/funcs/quant_dequant.h b/backends/metax_gpu/kernels/funcs/quant_dequant.h deleted file mode 100644 index 301ae351c40..00000000000 --- a/backends/metax_gpu/kernels/funcs/quant_dequant.h +++ /dev/null @@ -1,430 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -// clang-format off -#include -#include "paddle/common/hostdevice.h" -#include "paddle/phi/backends/gpu/gpu_launch_config.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/transform.h" -#include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "blas/blas.h" -// clang-format on -namespace phi { - -using backends::gpu::GpuLaunchConfig; - -constexpr int DequantKernelVecSize = 4; - -template -inline HOSTDEVICE T roundWithTiesToEven(T x) { - T xLower = floor(x); - T xUpper = ceil(x); - // x is in interval [xl,xu]. Choose closest of two bounds, breaking ties to - // even. - T dLower = x - xLower; - T dUpper = xUpper - x; - return static_cast( - (dLower == dUpper ? fmod(xLower, 2.0F) == 0.0F : dLower < dUpper) - ? xLower - : xUpper); -} - -template -inline HOSTDEVICE T roundWithTiesAwayFromZero(T x) { - return static_cast(x > 0 ? ceil(x) : floor(x)); -} - -template -__forceinline__ __device__ int8_t quant_helper(const T input, - const float scale, - const int round_type, - const float max_bound, - const float min_bound) { - float quant_value = max_bound * scale * static_cast(input); - - if (round_type == 0) { - quant_value = static_cast(roundWithTiesToEven(quant_value)); - } else { - quant_value = static_cast(round(quant_value)); - } - quant_value = quant_value > max_bound ? max_bound : quant_value; - quant_value = quant_value < min_bound ? 
min_bound : quant_value; - return static_cast(quant_value); -} - -template -__forceinline__ __device__ int8_t -quant_helper_ties_to_even_or_away_from_zero(const T input, - const float scale, - const int round_type, - const float max_bound, - const float min_bound) { - float quant_value = max_bound * scale * static_cast(input); - - if (round_type == 0) { - quant_value = static_cast(roundWithTiesToEven(quant_value)); - } else { - quant_value = static_cast(roundWithTiesAwayFromZero(quant_value)); - } - quant_value = quant_value > max_bound ? max_bound : quant_value; - quant_value = quant_value < min_bound ? min_bound : quant_value; - return static_cast(quant_value); -} - -template -__global__ void QuantKernel(const T* input, - char4* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2; - int m_id = blockIdx.y * blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char4 tmp; - tmp.x = quant_helper( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - tmp.y = quant_helper( - input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); - tmp.z = quant_helper( - input[m_id * n + n_id + 2], scale, round_type, max_bound, min_bound); - tmp.w = quant_helper( - input[m_id * n + n_id + 3], scale, round_type, max_bound, min_bound); - output[(m_id * n + n_id) >> 2] = tmp; - } -} - -template -__global__ void QuantKernelWithVecSize(const T* input, - char4* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2; - int m_id = blockIdx.y * blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char4 tmp; - tmp.x = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - tmp.y = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); - tmp.z = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 2], scale, round_type, max_bound, min_bound); - tmp.w = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 3], scale, round_type, max_bound, min_bound); - output[(m_id * n + n_id) >> 2] = tmp; - } -} - -template -__global__ void QuantKernelWithVecSize(const T* input, - char3* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 3; - int m_id = blockIdx.y * blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char3 tmp; - tmp.x = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - tmp.y = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); - tmp.z = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 2], scale, round_type, max_bound, min_bound); - output[(m_id * n + n_id) / 3] = tmp; - } -} - -template -__global__ void QuantKernelWithVecSize(const T* input, - char2* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x) * 2; - int m_id = blockIdx.y * 
blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char2 tmp; - tmp.x = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - tmp.y = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); - output[(m_id * n + n_id) >> 1] = tmp; - } -} - -template -__global__ void QuantKernelWithVecSize(const T* input, - char* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound) { - int n_id = (blockIdx.x * blockDim.x + threadIdx.x); - int m_id = blockIdx.y * blockDim.y + threadIdx.y; - - bool check = ((m_id < m) && (n_id < n)); - if (check) { - char tmp; - tmp = quant_helper_ties_to_even_or_away_from_zero( - input[m_id * n + n_id], scale, round_type, max_bound, min_bound); - output[m_id * n + n_id] = tmp; - } -} - -template -void LaunchQuantKernel(const T* input, - int8_t* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound, - gpuStream_t stream) { - // TODO(minghaoBD): optimize the kennel launch times when m==1 or n==1 -#ifdef PADDLE_WITH_HIP - dim3 grid(((n >> 2) + 63) / 64, (m + 7) / 8); - dim3 block(64, 8); -#else - dim3 grid(((n >> 2) + 31) / 32, (m + 31) / 32); - dim3 block(32, 32); -#endif - - QuantKernel<<>>(input, - (char4*)output, // NOLINT - scale, - m, - n, - round_type, - max_bound, - min_bound); -} - -template -void LaunchQuantKernelWithVecSize(const T* input, - int8_t* output, - const float scale, - const int m, - const int n, - const int round_type, - const float max_bound, - const float min_bound, - gpuStream_t stream) { - int vec_size = 1; - if (n % 4 == 0) { - vec_size = 4; - } else if (n % 3 == 0) { - vec_size = 3; - } else if (n % 2 == 0) { - vec_size = 2; - } - -#ifdef PADDLE_WITH_HIP - dim3 grid(((n / vec_size) + 63) / 64, (m + 7) / 8); - dim3 block(64, 8); -#else - dim3 grid(((n / vec_size) + 31) / 32, (m + 31) / 32); - dim3 block(32, 32); -#endif - - switch (vec_size) { - case 4: - QuantKernelWithVecSize<<>>( - input, - reinterpret_cast(output), - scale, - m, - n, - round_type, - max_bound, - min_bound); - break; - case 3: - QuantKernelWithVecSize<<>>( - input, - reinterpret_cast(output), - scale, - m, - n, - round_type, - max_bound, - min_bound); - break; - case 2: - QuantKernelWithVecSize<<>>( - input, - reinterpret_cast(output), - scale, - m, - n, - round_type, - max_bound, - min_bound); - break; - case 1: - QuantKernelWithVecSize<<>>( - input, - reinterpret_cast(output), - scale, - m, - n, - round_type, - max_bound, - min_bound); - break; - default: - return; - } -} - -template -__global__ void DequantKernel(T* output, - const int32_t* input, - const int m, // batch size - const int n, // hidden - const float quant_in_scale, - const float* dequant_out_scale_data) { - int numel = m * n; - int stride = blockDim.x * gridDim.x * VecSize; - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize; - int col_id = idx % n; - - phi::AlignedVector in_vec; - phi::AlignedVector out_scale_vec; - phi::AlignedVector out_vec; - - for (; idx < numel; idx += stride) { - phi::Load(input + idx, &in_vec); - phi::Load(dequant_out_scale_data + col_id, &out_scale_vec); - -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - out_vec[i] = - static_cast(static_cast(in_vec[i]) * out_scale_vec[i]); - } - - phi::Store(out_vec, output + idx); - } -} - -template -void 
LaunchDequantKernel(const int32_t* input, - T* output, - const int m, // m - const int n, // n - gpuStream_t stream, - GpuLaunchConfig* gpu_config, - const float quant_in_scale, - const float* dequant_out_scale_data) { - DequantKernel - <<block_per_grid, gpu_config->thread_per_block, 0, stream>>>( - output, input, m, n, quant_in_scale, dequant_out_scale_data); -} - -template -__global__ void DequantKernelWithScaleOfInputAndWeight( - T* output, - const int32_t* input, - const int m, // batch size - const int n, // hidden - const float quant_in_scale, - const float* quant_weight_scale, - float quant_max_bound) { - int numel = m * n; - int stride = blockDim.x * gridDim.x * VecSize; - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * VecSize; - int col_id = idx % n; - - phi::AlignedVector in_vec; - phi::AlignedVector out_scale_vec; - phi::AlignedVector out_vec; - - for (; idx < numel; idx += stride) { - phi::Load(input + idx, &in_vec); - phi::Load(quant_weight_scale + col_id, &out_scale_vec); - -#pragma unroll - for (int i = 0; i < VecSize; ++i) { - out_vec[i] = static_cast(static_cast(in_vec[i]) / - (quant_max_bound * quant_max_bound * - quant_in_scale * out_scale_vec[i])); - } - - phi::Store(out_vec, output + idx); - } -} - -template -void LaunchDequantKernelWithScaleOfInputAndWeight( - const int32_t* input, - T* output, - const int m, // m - const int n, // n - gpuStream_t stream, - GpuLaunchConfig* gpu_config, - const float quant_in_scale, - const float* quant_weight_scale, - float quant_max_bound) { - if (n % DequantKernelVecSize != 0) { - DequantKernelWithScaleOfInputAndWeight<<block_per_grid, - gpu_config->thread_per_block, - 0, - stream>>>(output, - input, - m, - n, - quant_in_scale, - quant_weight_scale, - quant_max_bound); - return; - } - DequantKernelWithScaleOfInputAndWeight - <<block_per_grid, gpu_config->thread_per_block, 0, stream>>>( - output, - input, - m, - n, - quant_in_scale, - quant_weight_scale, - quant_max_bound); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/gpudnn/cudnn.cc b/backends/metax_gpu/kernels/gpudnn/cudnn.cc deleted file mode 100644 index dc403282c1c..00000000000 --- a/backends/metax_gpu/kernels/gpudnn/cudnn.cc +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/backends/dynload/cudnn.h" // NOLINT - -#include "paddle/phi/core/enforce.h" - -namespace phi::dynload { - -std::once_flag cudnn_dso_flag; -void* cudnn_dso_handle = nullptr; - -#define DEFINE_WRAP(__name) DynLoad__##__name __name - -CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8 -CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R7 -CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7 -CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7 -CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R8 -CUDNN_DNN_ROUTINE_EACH_R8(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_FRONTEND -CUDNN_DNN_ROUTINE_EACH_FRONTEND(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9 -CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9 -CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7_REMOVED_IN_E9(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R9 -CUDNN_DNN_ROUTINE_EACH_R9(DEFINE_WRAP); -#endif - -bool HasCUDNN() { - std::call_once(cudnn_dso_flag, - []() { cudnn_dso_handle = GetCUDNNDsoHandle(); }); - return cudnn_dso_handle != nullptr; -} - -void EnforceCUDNNLoaded(const char* fn_name) { - PADDLE_ENFORCE_NOT_NULL( - cudnn_dso_handle, - common::errors::PreconditionNotMet( - "Cannot load cudnn shared library. Cannot invoke method %s.", - fn_name)); -} - -} // namespace phi::dynload diff --git a/backends/metax_gpu/kernels/gpudnn/cudnn.h b/backends/metax_gpu/kernels/gpudnn/cudnn.h deleted file mode 100644 index 65cb6b338b7..00000000000 --- a/backends/metax_gpu/kernels/gpudnn/cudnn.h +++ /dev/null @@ -1,218 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#ifdef PADDLE_WITH_CUDA -#include - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" - -namespace phi { -namespace dynload { - -extern std::once_flag cudnn_dso_flag; -extern void* cudnn_dso_handle; -extern bool HasCUDNN(); - -extern void EnforceCUDNNLoaded(const char* fn_name); -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using cudnn_func = decltype(&::__name); \ - std::call_once(cudnn_dso_flag, []() { \ - cudnn_dso_handle = phi::dynload::GetCUDNNDsoHandle(); \ - }); \ - EnforceCUDNNLoaded(#__name); \ - std::string replaced_name = #__name; \ - replaced_name = replaced_name.replace(0, 2, "mc"); \ - static void* p_##__name = \ - dlsym(cudnn_dso_handle, replaced_name.c_str()); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern struct DynLoad__##__name __name - -/** - * include all needed cudnn functions in HPPL - * different cudnn version has different interfaces - **/ -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ - __macro(cudnnGetTensorNdDescriptor); \ - __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnCreateTensorDescriptor); \ - __macro(cudnnDestroyTensorDescriptor); \ - __macro(cudnnCreateFilterDescriptor); \ - __macro(cudnnSetFilter4dDescriptor); \ - __macro(cudnnSetFilterNdDescriptor); \ - __macro(cudnnGetFilterNdDescriptor); \ - __macro(cudnnSetPooling2dDescriptor); \ - __macro(cudnnSetPoolingNdDescriptor); \ - __macro(cudnnGetPoolingNdDescriptor); \ - __macro(cudnnDestroyFilterDescriptor); \ - __macro(cudnnCreateConvolutionDescriptor); \ - __macro(cudnnCreatePoolingDescriptor); \ - __macro(cudnnDestroyPoolingDescriptor); \ - __macro(cudnnSetConvolution2dDescriptor); \ - __macro(cudnnDestroyConvolutionDescriptor); \ - __macro(cudnnSetConvolutionNdDescriptor); \ - __macro(cudnnGetConvolutionNdDescriptor); \ - __macro(cudnnDeriveBNTensorDescriptor); \ - __macro(cudnnCreateSpatialTransformerDescriptor); \ - __macro(cudnnSetSpatialTransformerNdDescriptor); \ - __macro(cudnnDestroySpatialTransformerDescriptor); \ - __macro(cudnnSpatialTfGridGeneratorForward); \ - __macro(cudnnSpatialTfGridGeneratorBackward); \ - __macro(cudnnSpatialTfSamplerForward); \ - __macro(cudnnSpatialTfSamplerBackward); \ - __macro(cudnnCreate); \ - __macro(cudnnDestroy); \ - __macro(cudnnSetStream); \ - __macro(cudnnActivationForward); \ - __macro(cudnnActivationBackward); \ - __macro(cudnnConvolutionForward); \ - __macro(cudnnConvolutionBackwardBias); \ - __macro(cudnnGetConvolutionForwardWorkspaceSize); \ - __macro(cudnnTransformTensor); \ - __macro(cudnnPoolingForward); \ - __macro(cudnnPoolingBackward); \ - __macro(cudnnSoftmaxBackward); \ - __macro(cudnnSoftmaxForward); \ - __macro(cudnnGetVersion); \ - __macro(cudnnFindConvolutionForwardAlgorithmEx); \ - __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ - __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ - __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ - __macro(cudnnGetErrorString); \ - __macro(cudnnCreateDropoutDescriptor); \ - __macro(cudnnDropoutGetStatesSize); \ - __macro(cudnnSetDropoutDescriptor); \ - __macro(cudnnRestoreDropoutDescriptor); \ - __macro(cudnnCreateRNNDescriptor); \ - __macro(cudnnGetRNNParamsSize); \ - __macro(cudnnGetRNNWorkspaceSize); \ - __macro(cudnnGetRNNTrainingReserveSize); \ - __macro(cudnnRNNForwardTraining); \ - __macro(cudnnRNNBackwardData); \ - __macro(cudnnRNNBackwardWeights); \ - __macro(cudnnRNNForwardInference); \ - __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); \ - __macro(cudnnSetTensorNdDescriptorEx); \ - __macro(cudnnAddTensor); \ - __macro(cudnnConvolutionBackwardData); \ - __macro(cudnnConvolutionBackwardFilter); \ - __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ - 
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize); \ - __macro(cudnnBatchNormalizationForwardTraining); \ - __macro(cudnnBatchNormalizationForwardInference); \ - __macro(cudnnBatchNormalizationBackward); \ - __macro(cudnnCreateActivationDescriptor); \ - __macro(cudnnSetActivationDescriptor); \ - __macro(cudnnGetActivationDescriptor); \ - __macro(cudnnDestroyActivationDescriptor); \ - __macro(cudnnSetRNNDescriptor_v6); -CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - -#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ - __macro(cudnnGetConvolutionForwardAlgorithm); \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ - __macro(cudnnSetRNNDescriptor); -CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 7001 -#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ - __macro(cudnnSetConvolutionGroupCount); \ - __macro(cudnnSetConvolutionMathType); \ - __macro(cudnnConvolutionBiasActivationForward); \ - __macro(cudnnCreateCTCLossDescriptor); \ - __macro(cudnnDestroyCTCLossDescriptor); \ - __macro(cudnnGetCTCLossDescriptor); \ - __macro(cudnnSetCTCLossDescriptor); \ - __macro(cudnnGetCTCLossWorkspaceSize); \ - __macro(cudnnCTCLoss); \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm_v7); \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm_v7); \ - __macro(cudnnGetConvolutionForwardAlgorithm_v7); \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithmMaxCount); -CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 7201 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \ - __macro(cudnnCreateRNNDataDescriptor); \ - __macro(cudnnDestroyRNNDataDescriptor); \ - __macro(cudnnSetRNNDataDescriptor); \ - __macro(cudnnSetRNNPaddingMode); \ - __macro(cudnnRNNForwardTrainingEx); \ - __macro(cudnnRNNBackwardDataEx); \ - __macro(cudnnRNNBackwardWeightsEx); \ - __macro(cudnnRNNForwardInferenceEx); -CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 7401 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7(__macro) \ - __macro(cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize); \ - __macro(cudnnBatchNormalizationForwardTrainingEx); \ - __macro(cudnnGetBatchNormalizationBackwardExWorkspaceSize); \ - __macro(cudnnBatchNormalizationBackwardEx); \ - __macro(cudnnGetBatchNormalizationTrainingExReserveSpaceSize); -CUDNN_DNN_ROUTINE_EACH_AFTER_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#if CUDNN_VERSION >= 8000 -#define CUDNN_DNN_ROUTINE_EACH_R8(__macro) \ - __macro(cudnnSetRNNDescriptor_v8); \ - __macro(cudnnCreateFusedOpsPlan); \ - __macro(cudnnCreateFusedOpsConstParamPack); \ - __macro(cudnnCreateFusedOpsVariantParamPack); \ - __macro(cudnnDestroyFusedOpsPlan); \ - __macro(cudnnDestroyFusedOpsConstParamPack); \ - __macro(cudnnDestroyFusedOpsVariantParamPack); \ - __macro(cudnnFusedOpsExecute); \ - __macro(cudnnSetFusedOpsConstParamPackAttribute); \ - __macro(cudnnSetFusedOpsVariantParamPackAttribute); \ - __macro(cudnnMakeFusedOpsPlan); -CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -#ifdef PADDLE_WITH_CUDNN_FRONTEND -#define CUDNN_DNN_ROUTINE_EACH_FRONTEND(__macro) \ - __macro(cudnnBackendCreateDescriptor); \ - __macro(cudnnBackendDestroyDescriptor); \ - __macro(cudnnBackendExecute); \ - __macro(cudnnBackendFinalize); \ - __macro(cudnnBackendGetAttribute); \ - __macro(cudnnBackendSetAttribute); \ - __macro(cudnnGetStream); \ 
- __macro(cudnnReorderFilterAndBias); -CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -} // namespace dynload -} // namespace phi - -#endif diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h index b517b719d49..a2c69b6adf0 100644 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/phi/kernels/addmm_kernel.h" -#include "../funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" // clang-format on diff --git a/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h index 593c044fc76..1c52ea22e4e 100644 --- a/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/baddbmm_kernel_impl.h @@ -17,9 +17,9 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" diff --git a/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h index ef61d48202f..b64f94bc7ef 100644 --- a/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/bilinear_grad_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h b/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h index c124e84eb6d..48861d48932 100644 --- a/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/bilinear_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/utils/optional.h" diff --git a/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h index 543df3ee964..cd5978ae59f 100644 --- a/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/bmm_grad_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "kernels/funcs/blas/blas.h" -#include "kernels/impl/matmul_grad_kernel_impl.h" #include "paddle/phi/kernels/bmm_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h index 7b4164032b2..ce493b4908a 100644 --- a/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/bmm_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/bmm_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { diff --git 
a/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h index 02332652660..5d146dae8d5 100644 --- a/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/cholesky_grad_kernel_impl.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/cholesky_grad_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h index 62115e9ee6a..098092767c4 100644 --- a/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/cholesky_solve_grad_kernel_impl.h @@ -14,7 +14,6 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/cholesky_solve_grad_kernel.h" #include "paddle/phi/kernels/cholesky_solve_kernel.h" #include "paddle/phi/kernels/complex_kernel.h" @@ -22,6 +21,7 @@ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h index 25e0d93a6a4..6066720ab07 100644 --- a/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/conv_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/vol2col.h" diff --git a/backends/metax_gpu/kernels/impl/conv_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_kernel_impl.h index 2cf5fa166e7..4395e5d5782 100644 --- a/backends/metax_gpu/kernels/impl/conv_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/conv_kernel_impl.h @@ -14,11 +14,11 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/conv_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/vol2col.h" diff --git a/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h b/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h index c7c002d4e9e..aadc5d2b8a0 100644 --- a/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/conv_transpose_kernel_impl.h @@ -14,12 +14,12 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/common/ddim.h" #include "paddle/common/layout.h" #include "paddle/phi/kernels/conv_transpose_kernel.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include 
"paddle/phi/kernels/funcs/concat_and_split_functor.h" #include "paddle/phi/kernels/funcs/im2col.h" #include "paddle/phi/kernels/funcs/slice.h" diff --git a/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h index d2419966342..b9931a89978 100644 --- a/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/deformable_conv_grad_kernel_impl.h @@ -14,11 +14,11 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/common/hostdevice.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/elementwise.h b/backends/metax_gpu/kernels/impl/elementwise.h index 52a7709424b..b9f3d8af1c9 100644 --- a/backends/metax_gpu/kernels/impl/elementwise.h +++ b/backends/metax_gpu/kernels/impl/elementwise.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h b/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h index d4526922c7b..dc4059a7225 100644 --- a/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/flatten2_kernel_impl.h @@ -15,10 +15,10 @@ #pragma once #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/flatten_grad_kernel.h" #include "paddle/phi/kernels/flatten_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/flatten2_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h b/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h index 0929a327035..ef12141f911 100644 --- a/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/gru_unit_kernel_impl.h @@ -16,10 +16,10 @@ #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/activation_functor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/utils/optional.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/index_select_impl.h b/backends/metax_gpu/kernels/impl/index_select_impl.h index 78284107d34..ac39cab2704 100644 --- a/backends/metax_gpu/kernels/impl/index_select_impl.h +++ b/backends/metax_gpu/kernels/impl/index_select_impl.h @@ -15,9 +15,9 @@ #pragma once #include "glog/logging.h" -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h index 
85aff008b4e..64b56f2cd1c 100644 --- a/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/inverse_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/complex_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" #include "paddle/phi/kernels/inverse_grad_kernel.h" diff --git a/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h b/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h index 079548b4ad0..4a061fe4716 100644 --- a/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/lstm_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/activation_functions.h" #include "paddle/phi/kernels/funcs/lstm_compute.h" #include "paddle/phi/kernels/funcs/lstm_utils.h" diff --git a/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h index e9ef47490bc..5a2e5d48a11 100644 --- a/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/lu_grad_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/impl/lu_kernel_impl.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" diff --git a/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h index 21c711c53ef..24dee650dfe 100644 --- a/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/lu_solve_grad_kernel_impl.h @@ -15,9 +15,9 @@ #pragma once #include "paddle/phi/infermeta/binary.h" -// #include "paddle/phi/kernels/funcs/blas/blas.h" +// #include "paddle/phi/paddle/phi/kernels/funcs/blas/blas.h" -#include "kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_solve.h" #include "paddle/phi/kernels/impl/lu_kernel_impl.h" diff --git a/backends/metax_gpu/kernels/impl/matmul_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_grad_kernel_impl.h deleted file mode 100644 index 823851666f1..00000000000 --- a/backends/metax_gpu/kernels/impl/matmul_grad_kernel_impl.h +++ /dev/null @@ -1,2042 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -// clang-format off -#include "glog/logging.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/complex_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/full_kernel.h" -#include "paddle/phi/kernels/funcs/reduce_function.h" -#include "paddle/phi/kernels/funcs/reduce_functor.h" -#include "paddle/phi/kernels/impl/dot_grad_kernel_impl.h" -// #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" -#include "paddle/phi/kernels/reduce_sum_kernel.h" - -#include "../impl/matmul_kernel_impl.h" -// clang-format on - -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/kernels/gpu/reduce.h" -#endif - -namespace phi { - -template -struct ReduceSumForMatmulGrad { - void operator()(const Context& dev_ctx, - const DenseTensor& input, - DenseTensor* output, - const std::vector& reduce_dims); -}; - -template -struct ReduceSumForMatmulGrad { - void operator()(const CPUContext& dev_ctx, - const DenseTensor& input, - DenseTensor* output, - const std::vector& reduce_dims) { - std::vector reduce_dims_tmp(reduce_dims.begin(), - reduce_dims.end()); - funcs::ReduceKernelImpl( - dev_ctx, input, output, reduce_dims_tmp, true, false); - } -}; - -#if defined(__NVCC__) || defined(__HIPCC__) -template -struct ReduceSumForMatmulGrad { - void operator()(const GPUContext& dev_ctx, - const DenseTensor& input, - DenseTensor* output, - const std::vector& reduce_dims) { - phi::SumKernel( - dev_ctx, input, reduce_dims, input.dtype(), false, output); - } -}; -#endif - -// Reshape a rank-3 tensor from P x M x N to (P * M) x N. -// Identity op if the tensor is not of rank 3. -static DenseTensor FoldInitDims(const DenseTensor& input) { - DenseTensor output = input; - auto in_dims = input.dims(); - if (in_dims.size() == 3) { - output.Resize({in_dims[0] * in_dims[1], in_dims[2]}); - } - return output; -} - -// Reshape a rank-3 tensor from P x M x N to M x (P * N). -// (Warning: This requires transposing data and writes into new memory.) -// Identity op if the tensor is not of rank 3. -template -static DenseTensor FoldHeadAndLastDims(const Context& dev_ctx, - const DenseTensor& input) { - auto in_dims = input.dims(); - if (in_dims.size() != 3) { - return input; - } - DenseTensor output = EmptyLike(dev_ctx, input); - output.Resize({in_dims[1], in_dims[0], in_dims[2]}); - std::vector axis = {1, 0, 2}; - funcs::Transpose trans; - trans(dev_ctx, input, &output, axis); - output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); - return output; -} - -template -typename std::enable_if::value>::type MatMul( - const Context& dev_ctx, - const DenseTensor& a, - bool trans_a, - const DenseTensor& b, - bool trans_b, - DenseTensor* out, - bool flag = false) { - dev_ctx.template Alloc(out); - auto blas = phi::funcs::GetBlas(dev_ctx); - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b); - if (a.dims().size() == 3 && b.dims().size() <= 2) { - // the transpose_X must be false, if is true, the transpose cost much time - if (!trans_a) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } - } - blas.MatMul(a.data(), - mat_dim_a, - b.data(), - mat_dim_b, - static_cast(1), - dev_ctx.template Alloc(out), - static_cast(flag)); -} - -/** - * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the - * original x_dim is returned. 
- */ -static DDim RowMatrixFromVector(const DDim& x_dim) { - if (x_dim.size() > 1) { - return x_dim; - } - return common::make_ddim({1, x_dim[0]}); -} - -/** - * Get column matrix shape from a vector shape. If the ran of y_dim > 1, the - * original y_dim is returned. - */ -static DDim ColumnMatrixFromVector(const DDim& y_dim) { - if (y_dim.size() > 1) { - return y_dim; - } - return common::make_ddim({y_dim[0], 1}); -} - -/** - * Reshape a tensor to 3-D or 2-D tensor by matrix descriptor. - * - * The shape would be [BatchSize, H, W] or [H, W]. - * If transposed, `H,W` will be swapped. - */ -static void ReshapeTensorIntoMatrixSequence( - DenseTensor* x, const phi::funcs::MatDescriptor& descriptor) { - int64_t h, w; - h = descriptor.height_; - w = descriptor.width_; - if (descriptor.trans_) { - std::swap(w, h); - } - if (descriptor.batch_size_) { - x->Resize({descriptor.batch_size_, h, w}); - } else { - x->Resize({h, w}); - } -} - -static void ReshapeXYOutIntoMatrixSequence(DenseTensor* x, - DenseTensor* y, - DenseTensor* out, - bool trans_x, - bool trans_y) { - auto x_dim = RowMatrixFromVector(x->dims()); - auto y_dim = ColumnMatrixFromVector(y->dims()); - auto mat_dim_x = phi::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x); - auto mat_dim_y = phi::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y); - if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { - out->Resize({mat_dim_x.height_, mat_dim_y.width_}); - } else { - out->Resize({(std::max)(mat_dim_x.batch_size_, mat_dim_y.batch_size_), - mat_dim_x.height_, - mat_dim_y.width_}); - } - - ReshapeTensorIntoMatrixSequence(x, mat_dim_x); - ReshapeTensorIntoMatrixSequence(y, mat_dim_y); -} - -template -void CalcInputGrad(const Context& dev_ctx, - const DenseTensor& a, - bool trans_a, - bool is_fold_init_dims_a, - const DenseTensor& b, - bool trans_b, - bool is_fold_init_dims_b, - DenseTensor* out, - bool flag = false) { - if (out == nullptr) return; - bool need_combine = - (a.dims().size() == 3 || b.dims().size() == 3) && out->dims().size() == 2; - if (!need_combine) { - MatMul(dev_ctx, a, trans_a, b, trans_b, out, flag); - } else { - MatMul( - dev_ctx, - is_fold_init_dims_a ? FoldInitDims(a) - : FoldHeadAndLastDims(dev_ctx, a), - trans_a, - is_fold_init_dims_b ? FoldInitDims(b) - : FoldHeadAndLastDims(dev_ctx, b), - trans_b, - out, - flag); - } -} - -template -void MatmulGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - bool transpose_x, - bool transpose_y, - DenseTensor* dx, - DenseTensor* dy) { - // get dims - std::vector x_dims = common::vectorize(x.dims()); - std::vector y_dims = common::vectorize(y.dims()); - std::vector dout_dims = common::vectorize(out_grad.dims()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - // Case1 : x's or y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - if (dx) dev_ctx.template Alloc(dx); - if (dy) dev_ctx.template Alloc(dy); - if (out_grad.numel() == 1) { - DotGradFunction()(dev_ctx, &x, &y, &out_grad, dx, dy); - return; - } - } - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal( - x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); - } - - // for complex - DenseTensor x_conj; - DenseTensor y_conj; - - // Case2: no broadcast or no batch size, it aims to speed and it is same as - // matmul in old version. 
- if (!is_broadcast) { - DenseTensor x_help = x; - DenseTensor y_help = y; - DenseTensor out_grad_help = out_grad; - - ReshapeXYOutIntoMatrixSequence( - &x_help, &y_help, &out_grad_help, transpose_x, transpose_y); - - DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x_help.dims()) { - dx->Resize(x_help.dims()); - } - - y_conj = Conj(dev_ctx, y_help); - } - - DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y_help.dims()) { - dy->Resize(y_help.dims()); - } - - x_conj = Conj(dev_ctx, x_help); - } - - if (transpose_x && transpose_y) { - CalcInputGrad( - dev_ctx, y_conj, true, true, out_grad_help, true, false, dx); - CalcInputGrad( - dev_ctx, out_grad_help, true, true, x_conj, true, false, dy); - } else if (transpose_x) { - CalcInputGrad( - dev_ctx, y_conj, false, false, out_grad_help, true, false, dx); - CalcInputGrad( - dev_ctx, x_conj, false, false, out_grad_help, false, true, dy); - } else if (transpose_y) { - CalcInputGrad( - dev_ctx, out_grad_help, false, false, y_conj, false, true, dx); - CalcInputGrad( - dev_ctx, out_grad_help, true, true, x_conj, false, true, dy); - } else { - CalcInputGrad( - dev_ctx, out_grad_help, false, false, y_conj, true, false, dx); - CalcInputGrad( - dev_ctx, x_conj, true, true, out_grad_help, false, true, dy); - } - - if (dx) { - if (dx_dims != x_help.dims()) { - dx->Resize(dx_dims); - } - } - if (dy) { - if (dy_dims != y_help.dims()) { - dy->Resize(dy_dims); - } - } - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. So we should avoid the case in reality"; - x_conj = Conj(dev_ctx, x); - y_conj = Conj(dev_ctx, y); - - DenseTensor dx_help; - DenseTensor dy_help; - - if (transpose_x) { - if (transpose_y) { - // X'Y': dA = Y'G', dB = G'X' - if (dx) - MatMulFunction(dev_ctx, - y_conj, - out_grad, - y_dims, - dout_dims, - &dx_help, - true, - true); - if (dy) - MatMulFunction(dev_ctx, - out_grad, - x_conj, - dout_dims, - x_dims, - &dy_help, - true, - true); - } else { - // X'Y: dX = YG', dY = XG - if (dx) - MatMulFunction(dev_ctx, - y_conj, - out_grad, - y_dims, - dout_dims, - &dx_help, - false, - true); - if (dy) - MatMulFunction(dev_ctx, - x_conj, - out_grad, - x_dims, - dout_dims, - &dy_help, - false, - false); - } - } else { - if (transpose_y) { - // XY': dX = GY, dY = G'X - if (dx) - MatMulFunction(dev_ctx, - out_grad, - y_conj, - dout_dims, - y_dims, - &dx_help, - false, - false); - if (dy) - MatMulFunction(dev_ctx, - out_grad, - x_conj, - dout_dims, - x_dims, - &dy_help, - true, - false); - } else { - // XY: dX = GY', dY = X'G - if (dx) - MatMulFunction(dev_ctx, - out_grad, - y_conj, - dout_dims, - y_dims, - &dx_help, - false, - true); - if (dy) - MatMulFunction(dev_ctx, - x_conj, - out_grad, - x_dims, - dout_dims, - &dy_help, - true, - false); - } - } - - // get help dims - const std::vector dx_help_dims = - common::vectorize(dx_help.dims()); - const std::vector dy_help_dims = - common::vectorize(dy_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill( - dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill( - dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), - x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), - y_dims.data() + y_ndim, - 
dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // reduce sum to get grad by ReduceSum - if (dx) { - if (dx_reduce_dims.empty()) { - *dx = std::move(dx_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, dx_help, dx, dx_reduce_dims); - } - dx->Resize(x.dims()); - } - if (dy) { - if (dy_reduce_dims.empty()) { - *dy = std::move(dy_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, dy_help, dy, dy_reduce_dims); - } - dy->Resize(y.dims()); - } - // Get the OutputGrad(out) - } -} - -template -void MatmulDoubleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - const paddle::optional& ddx, - const paddle::optional& ddy, - bool transpose_x, - bool transpose_y, - DenseTensor* dx, - DenseTensor* dy, - DenseTensor* ddout) { - // Get dims from the input x, y, output_grad - std::vector x_dims = common::vectorize(x.dims()); - std::vector y_dims = common::vectorize(y.dims()); - std::vector dout_dims = common::vectorize(dout.dims()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - // Case1 : x's or y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - DotDoubleGradFunction()( - dev_ctx, &x, &y, &dout, &ddx, &ddy, dx, dy, ddout); - return; - } - - DenseTensor x_conj; - DenseTensor y_conj; - DenseTensor dout_conj; - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal( - x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); - } - - if (!is_broadcast) { - // Case2: no broadcast or no batch size - DenseTensor x_help = x; - DenseTensor y_help = y; - DenseTensor dout_help = dout; - ReshapeXYOutIntoMatrixSequence( - &x_help, &y_help, &dout_help, transpose_x, transpose_y); - DDim dx_dims; - - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x_help.dims()) { - dx->Resize(x_help.dims()); - } - } - - DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y_help.dims()) { - dy->Resize(y_help.dims()); - } - } - - DDim ddout_dims; - if (ddout) { - ddout_dims = ddout->dims(); - if (ddout_dims != dout_help.dims()) { - ddout->Resize(dout_help.dims()); - } - - x_conj = Conj(dev_ctx, x_help); - y_conj = Conj(dev_ctx, y_help); - } - - if (dx || dy) { - dout_conj = Conj(dev_ctx, dout_help); - } - - bool ddout_flag = false; - if (ddx) { - auto ddx_mat = ddx.get(); - if (ddx_mat.dims() != x_help.dims()) { - ddx_mat.Resize(x_help.dims()); - } - if (dy) { - if (transpose_x && transpose_y) { - // dy = dout' * ddx' - CalcInputGrad( - dev_ctx, dout_conj, true, true, ddx_mat, true, false, dy, false); - } else if (transpose_x) { - // dy = ddx * dout - CalcInputGrad(dev_ctx, - ddx_mat, - false, - false, - dout_conj, - false, - true, - dy, - false); - } else if (transpose_y) { - // dy = dout' * ddx - CalcInputGrad( - dev_ctx, dout_conj, true, true, ddx_mat, false, true, dy, false); - } else { - // dy = ddx' * dout - CalcInputGrad( - dev_ctx, ddx_mat, true, true, dout_conj, false, true, dy, false); - } - } - - if (ddout) { - CalcInputGrad(dev_ctx, - ddx_mat, - transpose_x, - true, - y_conj, - transpose_y, - false, - ddout, - ddout_flag); - ddout_flag = true; - } - } 
else if (!ddx && dy) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); - } - if (ddy) { - auto ddy_mat = ddy.get(); - if (ddy_mat.dims() != y_help.dims()) { - ddy_mat.Resize(y_help.dims()); - } - if (dx) { - if (transpose_x && transpose_y) { - // dx = ddy' * dout' - CalcInputGrad( - dev_ctx, ddy_mat, true, true, dout_conj, true, false, dx, false); - } else if (transpose_x) { - // dx = ddy * dout' - CalcInputGrad(dev_ctx, - ddy_mat, - false, - false, - dout_conj, - true, - false, - dx, - false); - } else if (transpose_y) { - // dx = dout * ddy - CalcInputGrad(dev_ctx, - dout_conj, - false, - false, - ddy_mat, - false, - true, - dx, - false); - } else { - // dx = dout * ddy' - CalcInputGrad(dev_ctx, - dout_conj, - false, - false, - ddy_mat, - true, - false, - dx, - false); - } - } - - if (ddout) { - CalcInputGrad(dev_ctx, - x_conj, - transpose_x, - true, - ddy_mat, - transpose_y, - false, - ddout, - ddout_flag); - } - } else if (!ddy && dx) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), dx); - } - if (ddout && !ddx && !ddy) { - FullLikeKernel( - dev_ctx, dout, Scalar(0.0), dout.dtype(), ddout); - } - - if (dx) { - if (dx_dims != x_help.dims()) { - dx->Resize(dx_dims); - } - } - - if (dy) { - if (dy_dims != y_help.dims()) { - dy->Resize(dy_dims); - } - } - - if (ddout) { - if (ddout_dims != dout_help.dims()) { - ddout->Resize(ddout_dims); - } - } - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. So we should avoid the case in reality"; - if (dx || dy) { - dout_conj = Conj(dev_ctx, dout); - } - if (ddout) { - x_conj = Conj(dev_ctx, x); - y_conj = Conj(dev_ctx, y); - } - - DenseTensor dx_help; - DenseTensor dy_help; - - if (transpose_x) { - if (transpose_y) { - if (dx && ddy) { - MatMulFunction(dev_ctx, - ddy.get(), - dout_conj, - y_dims, - dout_dims, - &dx_help, - true, - true); - } - if (dy && ddx) { - MatMulFunction(dev_ctx, - dout_conj, - ddx.get(), - dout_dims, - x_dims, - &dy_help, - true, - true); - } - } else { - if (dx && ddy) { - MatMulFunction(dev_ctx, - ddy.get(), - dout_conj, - y_dims, - dout_dims, - &dx_help, - false, - true); - } - if (dy && ddx) { - MatMulFunction(dev_ctx, - ddx.get(), - dout_conj, - x_dims, - dout_dims, - &dy_help, - false, - false); - } - } - } else { - if (transpose_y) { - if (dx && ddy) { - MatMulFunction(dev_ctx, - dout_conj, - ddy.get(), - dout_dims, - y_dims, - &dx_help, - false, - false); - } - if (dy && ddx) { - MatMulFunction(dev_ctx, - dout_conj, - ddx.get(), - dout_dims, - x_dims, - &dy_help, - true, - false); - } - } else { - if (dx && ddy) { - MatMulFunction(dev_ctx, - dout_conj, - ddy.get(), - dout_dims, - y_dims, - &dx_help, - false, - true); - } - if (dy && ddx) { - MatMulFunction(dev_ctx, - ddx.get(), - dout_conj, - x_dims, - dout_dims, - &dy_help, - true, - false); - } - } - } - - // get help dims - const std::vector dx_help_dims = - common::vectorize(dx_help.dims()); - const std::vector dy_help_dims = - common::vectorize(dy_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill( - dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill( - dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), - x_dims.data() + x_ndim, - dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), - 
y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - // Reduce sum to get grad by ReduceSum - if (dx && dx_help.initialized()) { - if (dx_reduce_dims.empty()) { - *dx = std::move(dx_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, dx_help, dx, dx_reduce_dims); - } - dx->Resize(x.dims()); - } else if (dx && !dx_help.initialized()) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), dx); - } - if (dy && dy_help.initialized()) { - if (dy_reduce_dims.empty()) { - *dy = std::move(dy_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, dy_help, dy, dy_reduce_dims); - } - dy->Resize(y.dims()); - } else if (dy && !dy_help.initialized()) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), dy); - } - - if (ddout) { - // Calculate the gradient of OutputGrad(Out) - if (ddx) { - MatMulFunction(dev_ctx, - ddx.get(), - y_conj, - x_dims, - y_dims, - ddout, - transpose_x, - transpose_y); - } - - if (ddy) { - MatMulFunction(dev_ctx, - x_conj, - ddy.get(), - x_dims, - y_dims, - ddout, - transpose_x, - transpose_y, - true); - } - } - } -} - -template -void MatmulTripleGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& dout, - const paddle::optional& ddx, - const paddle::optional& ddy, - const paddle::optional& d_dx, - const paddle::optional& d_dy, - const paddle::optional& d_ddout, - bool transpose_x, - bool transpose_y, - DenseTensor* out_d_x, - DenseTensor* out_d_y, - DenseTensor* out_d_dout, - DenseTensor* out_d_ddx, - DenseTensor* out_d_ddy) { - // Get dims from the input x, y, output_grad - std::vector x_dims = common::vectorize(x.dims()); - std::vector y_dims = common::vectorize(y.dims()); - std::vector dout_dims = common::vectorize(dout.dims()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - int ndim = dout_dims.size(); - - // Case1 : x's and y's dim = 1 - if (x_ndim == 1 && y_ndim == 1) { - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 1"; - DotTripleGradFunction()(dev_ctx, - &x, - &y, - &dout, - &ddx, - &ddy, - &d_dx, - &d_dy, - &d_ddout, - out_d_x, - out_d_y, - out_d_dout, - out_d_ddx, - out_d_ddy); - return; - } - - DenseTensor x_conj; - DenseTensor y_conj; - DenseTensor dout_conj; - DenseTensor ddx_conj; - DenseTensor ddy_conj; - - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; - } else { - is_broadcast = !std::equal( - x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, y_dims.cbegin()); - } - - if (!is_broadcast) { - // Case2: no broadcast or no batch size - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 2"; - DenseTensor x_help = x; - DenseTensor y_help = y; - DenseTensor dout_help = dout; - - DenseTensor ddx_help; - DenseTensor ddy_help; - ReshapeXYOutIntoMatrixSequence( - &x_help, &y_help, &dout_help, transpose_x, transpose_y); - if (ddx) { - ddx_help = ddx.get(); - if (ddx_help.dims() != x_help.dims()) { - ddx_help.Resize(x_help.dims()); - } - } - - if (ddy) { - ddy_help = ddy.get(); - if (ddy_help.dims() != y_help.dims()) { - ddy_help.Resize(y_help.dims()); - } - } - - DDim out_dx_dims; - if (out_d_x) { - out_dx_dims = out_d_x->dims(); - if 
(out_dx_dims != x_help.dims()) { - out_d_x->Resize(x_help.dims()); - } - if (ddy) { - ddy_conj = Conj(dev_ctx, ddy_help); - } - } - DDim out_dy_dims; - if (out_d_y) { - out_dy_dims = out_d_y->dims(); - if (out_dy_dims != y_help.dims()) { - out_d_y->Resize(y_help.dims()); - } - if (ddx) { - ddx_conj = Conj(dev_ctx, ddx_help); - } - } - DDim out_d_dout_dims; - if (out_d_dout) { - out_d_dout_dims = out_d_dout->dims(); - if (out_d_dout_dims != dout_help.dims()) { - out_d_dout->Resize(dout_help.dims()); - } - if (ddx && !ddx_conj.IsInitialized()) { - ddx_conj = Conj(dev_ctx, ddx_help); - } - if (ddy && !ddy_conj.IsInitialized()) { - ddy_conj = Conj(dev_ctx, ddy_help); - } - } - DDim out_d_ddx_dims; - if (out_d_ddx) { - out_d_ddx_dims = out_d_ddx->dims(); - if (out_d_ddx_dims != x_help.dims()) { - out_d_ddx->Resize(x_help.dims()); - } - dout_conj = Conj(dev_ctx, dout_help); - y_conj = Conj(dev_ctx, y_help); - } - DDim out_d_ddy_dims; - if (out_d_ddy) { - out_d_ddy_dims = out_d_ddy->dims(); - if (out_d_ddy_dims != y_help.dims()) { - out_d_ddy->Resize(y_help.dims()); - } - if (!dout_conj.IsInitialized()) { - dout_conj = Conj(dev_ctx, dout_help); - } - x_conj = Conj(dev_ctx, x_help); - } - - bool d_dout_flag = false; - bool d_ddx_flag = false; - bool d_ddy_flag = false; - if (d_ddout) { - auto d_ddout_mat = d_ddout.get(); - if (d_ddout_mat.dims() != dout_help.dims()) { - d_ddout_mat.Resize(dout_help.dims()); - } - - if (out_d_y && ddx) { - if (transpose_x && transpose_y) { - // out_d_y = d_ddout' * ddx' - CalcInputGrad(dev_ctx, - d_ddout_mat, - true, - true, - ddx_conj, - true, - false, - out_d_y, - false); - } else if (transpose_x) { - // out_d_y = ddx * d_ddout - CalcInputGrad(dev_ctx, - ddx_conj, - false, - false, - d_ddout_mat, - false, - true, - out_d_y, - false); - } else if (transpose_y) { - // out_d_y = d_ddout' * ddx - CalcInputGrad(dev_ctx, - d_ddout_mat, - true, - true, - ddx_conj, - false, - true, - out_d_y, - false); - } else { - // out_d_y = ddx' * d_ddout - CalcInputGrad(dev_ctx, - ddx_conj, - true, - true, - d_ddout_mat, - false, - true, - out_d_y, - false); - } - } else if (out_d_y) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); - } - if (out_d_x && ddy) { - if (transpose_x && transpose_y) { - // out_d_x = ddy' * d_ddout' - CalcInputGrad(dev_ctx, - ddy_conj, - true, - true, - d_ddout_mat, - true, - false, - out_d_x, - false); - } else if (transpose_x) { - // out_d_x = ddy * d_ddout' - CalcInputGrad(dev_ctx, - ddy_conj, - false, - false, - d_ddout_mat, - true, - false, - out_d_x, - false); - } else if (transpose_y) { - // out_d_x = d_ddout * ddy - CalcInputGrad(dev_ctx, - d_ddout_mat, - false, - false, - ddy_conj, - false, - true, - out_d_x, - false); - } else { - // out_d_x = d_ddout * ddy' - CalcInputGrad(dev_ctx, - d_ddout_mat, - false, - false, - ddy_conj, - true, - false, - out_d_x, - false); - } - } else if (out_d_x) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); - } - - // equations: - // d_ddx = DOut * D_DY + Y * D_DDOut - // Let: d_ddx1 = Y * D_DDOut - // Let: d_ddx2 = DOut * D_DY - - // d_ddy = DOut * D_DX + X * D_DDOut - // Let: d_ddy1 = X * D_DDOut - // Let: d_ddy2 = DOut * D_DX - - // d_dout = DDY * D_DX + DDX * D_DY - // Let: d_dout1 = DDX * D_DY - // Let: d_dout2 = DDY * D_DX - - // compute d_ddx1 - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx1 = y' * d_ddout' - CalcInputGrad(dev_ctx, - y_conj, - true, - true, - d_ddout_mat, - true, - false, - out_d_ddx, - d_ddx_flag); - } else if (transpose_x) { - // 
out_d_ddx1 = y * d_ddout' - CalcInputGrad(dev_ctx, - y_conj, - false, - false, - d_ddout_mat, - true, - false, - out_d_ddx, - d_ddx_flag); - } else if (transpose_y) { - // out_d_ddx1 = d_ddout * y - CalcInputGrad(dev_ctx, - d_ddout_mat, - false, - false, - y_conj, - false, - true, - out_d_ddx, - d_ddx_flag); - } else { - // out_d_ddx1 = d_ddout * y' - CalcInputGrad(dev_ctx, - d_ddout_mat, - false, - false, - y_conj, - true, - false, - out_d_ddx, - d_ddx_flag); - } - d_ddx_flag = true; - } - - // compute d_ddy1 - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy1 = d_ddout' * x' - CalcInputGrad(dev_ctx, - d_ddout_mat, - true, - true, - x_conj, - true, - false, - out_d_ddy, - false); - } else if (transpose_x) { - // out_d_ddy1 = x * d_ddout - CalcInputGrad(dev_ctx, - x_conj, - false, - false, - d_ddout_mat, - false, - true, - out_d_ddy, - false); - } else if (transpose_y) { - // out_d_ddy1 = d_ddout' * x - CalcInputGrad(dev_ctx, - d_ddout_mat, - true, - true, - x_conj, - false, - true, - out_d_ddy, - false); - } else { - // out_d_ddy1 = x' * d_ddout - CalcInputGrad(dev_ctx, - x_conj, - true, - true, - d_ddout_mat, - false, - true, - out_d_ddy, - false); - } - d_ddy_flag = true; - } - } else { - // d_ddout is none - if (out_d_x) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); - } - - if (out_d_y) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); - } - } - - if (d_dy) { - auto d_dy_mat = d_dy.get(); - if (d_dy_mat.dims() != y_help.dims()) { - d_dy_mat.Resize(y_help.dims()); - } - - // compute d_dout1 - if (out_d_dout && ddx) { - CalcInputGrad(dev_ctx, - ddx_conj, - transpose_x, - true, - d_dy_mat, - transpose_y, - false, - out_d_dout, - d_dout_flag); - d_dout_flag = true; - } - - // compute d_ddx2 - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx2 = D_DY' * DOut' - CalcInputGrad(dev_ctx, - d_dy_mat, - true, - true, - dout_conj, - true, - false, - out_d_ddx, - d_ddx_flag); - } else if (transpose_x) { - // out_d_ddx2 = D_DY * Dout' - CalcInputGrad(dev_ctx, - d_dy_mat, - false, - false, - dout_conj, - true, - false, - out_d_ddx, - d_ddx_flag); - } else if (transpose_y) { - // out_d_ddx2 = Dout * D_DY - CalcInputGrad(dev_ctx, - dout_conj, - false, - false, - d_dy_mat, - false, - true, - out_d_ddx, - d_ddx_flag); - } else { - // out_d_ddx2 = Dout * D_DY' - CalcInputGrad(dev_ctx, - dout_conj, - false, - false, - d_dy_mat, - true, - false, - out_d_ddx, - d_ddx_flag); - } - } - } - - if (d_dx) { - auto d_dx_mat = d_dx.get(); - if (d_dx_mat.dims() != x_help.dims()) { - d_dx_mat.Resize(x_help.dims()); - } - - // compute d_dout2 - if (out_d_dout && ddy) { - CalcInputGrad(dev_ctx, - d_dx_mat, - transpose_x, - true, - ddy_conj, - transpose_y, - false, - out_d_dout, - d_dout_flag); - } - - // compute d_ddy2 - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy2 = dout' * d_dx' - CalcInputGrad(dev_ctx, - dout_conj, - true, - true, - d_dx_mat, - true, - false, - out_d_ddy, - d_ddy_flag); - } else if (transpose_x) { - // out_d_ddy2 = d_dx * dout - CalcInputGrad(dev_ctx, - d_dx_mat, - false, - false, - dout_conj, - false, - true, - out_d_ddy, - d_ddy_flag); - } else if (transpose_y) { - // out_d_ddy2 = dout' * d_dx - CalcInputGrad(dev_ctx, - dout_conj, - true, - true, - d_dx_mat, - false, - true, - out_d_ddy, - d_ddy_flag); - } else { - // out_d_ddy2 = d_dx' * dout - CalcInputGrad(dev_ctx, - d_dx_mat, - true, - true, - dout_conj, - false, - true, - out_d_ddy, - d_ddy_flag); - } - } - } - - if (out_d_x) { - if (out_dx_dims != 
x_help.dims()) { - out_d_x->Resize(out_dx_dims); - } - } - - if (out_d_y) { - if (out_dy_dims != y_help.dims()) { - out_d_y->Resize(out_dy_dims); - } - } - - if (out_d_dout) { - if (out_d_dout_dims != dout_help.dims()) { - out_d_dout->Resize(out_d_dout_dims); - } - } - - if (out_d_ddx) { - if (out_d_ddx_dims != x_help.dims()) { - out_d_ddx->Resize(out_d_ddx_dims); - } - } - - if (out_d_ddy) { - if (out_d_ddy_dims != y_help.dims()) { - out_d_ddy->Resize(out_d_ddy_dims); - } - } - - if (out_d_dout && !out_d_dout->IsInitialized()) { - FullLikeKernel( - dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout); - } - - if (out_d_ddx && !out_d_ddx->IsInitialized()) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx); - } - - if (out_d_ddy && !out_d_ddy->IsInitialized()) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy); - } - } else { - // Case3: broadcast. It need cost much time to reduce sum for the - // broadcast and wastes the memory. - // So we should avoid the case in reality. - VLOG(3) << "======== MatMulV2TripleGradKernel, Compute ====== Case 3"; - VLOG(3) << "It need cost much time to reduce sum for the broadcast and " - "wastes the memory. So we should avoid the case in reality"; - - DenseTensor out_dx_help; - DenseTensor out_dy_help; - DenseTensor out_d_ddx_help; - DenseTensor out_d_ddy_help; - - if (out_d_dout) { - if (ddx) { - ddx_conj = Conj(dev_ctx, ddx.get()); - } - if (ddy) { - ddy_conj = Conj(dev_ctx, ddy.get()); - } - } - if (out_d_ddx || out_d_ddy) { - x_conj = Conj(dev_ctx, x); - y_conj = Conj(dev_ctx, y); - dout_conj = Conj(dev_ctx, dout); - } - - if (transpose_x) { - if (transpose_y) { - // dX = ddY' d_ddout’, dY = d_ddout’ ddX' - if (out_d_x && ddy && d_ddout) - MatMulFunction(dev_ctx, - ddy_conj, - d_ddout.get(), - y_dims, - dout_dims, - &out_dx_help, - true, - true); - if (out_d_y && ddx && d_ddout) - MatMulFunction(dev_ctx, - d_ddout.get(), - ddx_conj, - dout_dims, - x_dims, - &out_dy_help, - true, - true); - } else { - // dX = ddY d_ddout', dY = ddX d_ddout - if (out_d_x && ddy && d_ddout) - MatMulFunction(dev_ctx, - ddy_conj, - d_ddout.get(), - y_dims, - dout_dims, - &out_dx_help, - false, - true); - if (out_d_y && ddx && d_ddout) - MatMulFunction(dev_ctx, - ddx_conj, - d_ddout.get(), - x_dims, - dout_dims, - &out_dy_help, - false, - false); - } - - } else { - if (transpose_y) { - // dX = d_ddout ddY, dY = d_ddout’ ddX - if (out_d_x && ddy && d_ddout) - MatMulFunction(dev_ctx, - d_ddout.get(), - ddy_conj, - dout_dims, - y_dims, - &out_dx_help, - false, - false); - if (out_d_y && ddx && d_ddout) - MatMulFunction(dev_ctx, - d_ddout.get(), - ddx_conj, - dout_dims, - x_dims, - &out_dy_help, - true, - false); - } else { - // dX = d_ddout ddY', dY = ddX' d_ddout - if (out_d_x && ddy && d_ddout) - MatMulFunction(dev_ctx, - d_ddout.get(), - ddy_conj, - dout_dims, - y_dims, - &out_dx_help, - false, - true); - if (out_d_y && ddx && d_ddout) - MatMulFunction(dev_ctx, - ddx_conj, - d_ddout.get(), - x_dims, - dout_dims, - &out_dy_help, - true, - false); - } - } - - // get help dims - const std::vector dx_help_dims = - common::vectorize(out_dx_help.dims()); - const std::vector dy_help_dims = - common::vectorize(out_dx_help.dims()); - - std::vector dx_broadcast_dims(ndim); - std::vector dy_broadcast_dims(ndim); - - std::fill( - dx_broadcast_dims.data(), dx_broadcast_dims.data() + ndim - x_ndim, 1); - std::fill( - dy_broadcast_dims.data(), dy_broadcast_dims.data() + ndim - y_ndim, 1); - std::copy(x_dims.data(), - x_dims.data() + x_ndim, - 
dx_broadcast_dims.data() + ndim - x_ndim); - std::copy(y_dims.data(), - y_dims.data() + y_ndim, - dy_broadcast_dims.data() + ndim - y_ndim); - - std::vector dx_reduce_dims; - std::vector dy_reduce_dims; - for (int idx = 0; idx <= ndim - 3; idx++) { - if (dx_help_dims[idx] != 1 && dx_broadcast_dims[idx] == 1) { - dx_reduce_dims.push_back(idx); - } - if (dy_help_dims[idx] != 1 && dy_broadcast_dims[idx] == 1) { - dy_reduce_dims.push_back(idx); - } - } - - // Reduce sum to get grad by ReduceSum - if (out_d_x && out_dx_help.initialized()) { - if (dx_reduce_dims.empty()) { - *out_d_x = std::move(out_dx_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_dx_help, out_d_x, dx_reduce_dims); - } - out_d_x->Resize(x.dims()); - } else if (out_d_x) { - FullLikeKernel(dev_ctx, x, Scalar(0.0), x.dtype(), out_d_x); - } - - if (out_d_y && out_dy_help.initialized()) { - if (dy_reduce_dims.empty()) { - *out_d_y = std::move(out_dy_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_dy_help, out_d_y, dy_reduce_dims); - } - out_d_y->Resize(y.dims()); - } else if (out_d_y) { - FullLikeKernel(dev_ctx, y, Scalar(0.0), y.dtype(), out_d_y); - } - - // compute d_dout - if (out_d_dout) { - if (d_dx && ddy) { - MatMulFunction(dev_ctx, - d_dx.get(), - ddy_conj, - x_dims, - y_dims, - out_d_dout, - transpose_x, - transpose_y); - } - if (d_dy && ddx) { - MatMulFunction(dev_ctx, - ddx_conj, - d_dy.get(), - x_dims, - y_dims, - out_d_dout, - transpose_x, - transpose_y, - true); - } - - if (!out_d_dout->initialized()) { - FullLikeKernel( - dev_ctx, dout, Scalar(0.0), dout.dtype(), out_d_dout); - } - } - - // compute d_ddx - if (out_d_ddx) { - if (transpose_x && transpose_y) { - // out_d_ddx1 = y' * d_ddout' - if (d_ddout) { - MatMulFunction(dev_ctx, - y_conj, - d_ddout.get(), - y_dims, - dout_dims, - &out_d_ddx_help, - true, - true); - } - - // out_d_ddx2 = D_DY' * DOut' - if (d_dy) { - MatMulFunction(dev_ctx, - d_dy.get(), - dout_conj, - y_dims, - dout_dims, - &out_d_ddx_help, - true, - true, - true); - } - - } else if (transpose_x) { - // out_d_ddx1 = y * d_ddout' - if (d_ddout) { - MatMulFunction(dev_ctx, - y_conj, - d_ddout.get(), - y_dims, - dout_dims, - &out_d_ddx_help, - false, - true); - } - - // out_d_ddx2 = D_DY * Dout' - if (d_dy) { - MatMulFunction(dev_ctx, - d_dy.get(), - dout_conj, - y_dims, - dout_dims, - &out_d_ddx_help, - false, - true, - true); - } - - } else if (transpose_y) { - // out_d_ddx1 = d_ddout * y - if (d_ddout) { - MatMulFunction(dev_ctx, - d_ddout.get(), - y_conj, - dout_dims, - y_dims, - &out_d_ddx_help, - false, - false); - } - - // out_d_ddx2 = Dout * D_DY - if (d_dy) { - MatMulFunction(dev_ctx, - dout_conj, - d_dy.get(), - dout_dims, - y_dims, - &out_d_ddx_help, - false, - false, - true); - } - } else { - // out_d_ddx1 = d_ddout * y' - if (d_ddout) { - MatMulFunction(dev_ctx, - d_ddout.get(), - y_conj, - dout_dims, - y_dims, - &out_d_ddx_help, - false, - true); - } - - // out_d_ddx2 = Dout * D_DY' - if (d_dy) { - MatMulFunction(dev_ctx, - dout_conj, - d_dy.get(), - dout_dims, - y_dims, - &out_d_ddx_help, - false, - true, - true); - } - } - if (out_d_ddx_help.initialized()) { - if (dx_reduce_dims.empty()) { - *out_d_ddx = std::move(out_d_ddx_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_d_ddx_help, out_d_ddx, dx_reduce_dims); - } - } else { - FullLikeKernel( - dev_ctx, x, Scalar(0.0), x.dtype(), out_d_ddx); - } - - out_d_ddx->Resize(x.dims()); - } - - // compute d_ddy - if (out_d_ddy) { - if (transpose_x && transpose_y) { - // out_d_ddy1 = d_ddout' * x' - if 
(d_ddout) { - MatMulFunction(dev_ctx, - d_ddout.get(), - x_conj, - dout_dims, - x_dims, - &out_d_ddy_help, - true, - true); - } - - // out_d_ddy2 = dout' * d_dx' - if (d_dx) { - MatMulFunction(dev_ctx, - dout_conj, - d_dx.get(), - dout_dims, - x_dims, - &out_d_ddy_help, - true, - true, - true); - } - - } else if (transpose_x) { - // out_d_ddy1 = x * d_ddout - if (d_ddout) { - MatMulFunction(dev_ctx, - x_conj, - d_ddout.get(), - x_dims, - dout_dims, - &out_d_ddy_help, - false, - false); - } - - // out_d_ddy2 = d_dx * dout - if (d_dx) { - MatMulFunction(dev_ctx, - d_dx.get(), - dout_conj, - x_dims, - dout_dims, - &out_d_ddy_help, - false, - false, - true); - } - - } else if (transpose_y) { - // out_d_ddy1 = d_ddout' * x - if (d_ddout) { - MatMulFunction(dev_ctx, - d_ddout.get(), - x_conj, - dout_dims, - x_dims, - &out_d_ddy_help, - true, - false); - } - - // out_d_ddy2 = dout' * d_dx - if (d_dx) { - MatMulFunction(dev_ctx, - dout_conj, - d_dx.get(), - dout_dims, - x_dims, - &out_d_ddy_help, - true, - false, - true); - } - - } else { - // out_d_ddy1 = x' * d_ddout - if (d_ddout) { - MatMulFunction(dev_ctx, - x_conj, - d_ddout.get(), - x_dims, - dout_dims, - &out_d_ddy_help, - true, - false); - } - - // out_d_ddy2 = d_dx' * dout - if (d_dx) { - MatMulFunction(dev_ctx, - d_dx.get(), - dout_conj, - x_dims, - dout_dims, - &out_d_ddy_help, - true, - false, - true); - } - } - - if (out_d_ddy_help.initialized()) { - if (dy_reduce_dims.empty()) { - *out_d_ddy = std::move(out_d_ddy_help); - } else { - ReduceSumForMatmulGrad()( - dev_ctx, out_d_ddy_help, out_d_ddy, dy_reduce_dims); - } - } else { - FullLikeKernel( - dev_ctx, y, Scalar(0.0), y.dtype(), out_d_ddy); - } - - out_d_ddy->Resize(y.dims()); - } - } -} - -template -void MatmulWithFlattenGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* x_grad, - DenseTensor* y_grad) { - auto x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - auto y_matrix = - y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y; - auto* dout = &out_grad; - - DenseTensor dout_mat(*dout); - dout_mat.Resize({common::flatten_to_2d(x.dims(), x_num_col_dims)[0], - common::flatten_to_2d(y.dims(), y_num_col_dims)[1]}); - - auto* dx = x_grad; - auto* dy = y_grad; - - if (dx != nullptr) { - dx->set_lod(x.lod()); - } - if (dy != nullptr) { - dy->set_lod(y.lod()); - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - if (dx) { - dev_ctx.template Alloc(dx); - DenseTensor dx_matrix = - dx->dims().size() > 2 ? phi::ReshapeToMatrix(*dx, x_num_col_dims) : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - dev_ctx.template Alloc(dy); - DenseTensor dy_matrix = - dy->dims().size() > 2 ? phi::ReshapeToMatrix(*dy, y_num_col_dims) : *dy; - // dy = x' * dout. dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } -} - -template -void MatmulWithFlattenDoubleGradKernel( - const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - const paddle::optional& x_grad_grad, - const paddle::optional& y_grad_grad, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* x_grad, - DenseTensor* y_grad, - DenseTensor* out_grad_grad) { - auto x_mat = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - auto y_mat = - y.dims().size() > 2 ? 
phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - const int m = common::flatten_to_2d(x.dims(), x_num_col_dims)[0]; - const int n = common::flatten_to_2d(y.dims(), y_num_col_dims)[1]; - - auto* dout = &out_grad; - DenseTensor dout_mat(*dout); - dout_mat.Resize({m, n}); - - auto* ddx = x_grad_grad.get_ptr(); - auto* ddy = y_grad_grad.get_ptr(); - - auto* dx = x_grad; - auto* dy = y_grad; - auto* ddout = out_grad_grad; - - DenseTensor ddout_mat; - if (ddout) { - ddout->set_lod(dout->lod()); - // allocate and reshape ddout - dev_ctx.template Alloc(ddout); - ddout_mat.ShareDataWith(*ddout); - ddout_mat.Resize({m, n}); - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - // a flag to specify whether ddout value has been set, if flag - // is false, MatMul beta should be 0 to set ddout, if flag is - // true, MatMul beta should be 1 to add result to ddout. - bool ddout_flag = false; - if (ddx) { - auto ddx_mat = ddx->dims().size() > 2 - ? phi::ReshapeToMatrix(*ddx, x_num_col_dims) - : static_cast(*ddx); - - // dy = ddx' * dout. dy : K x M, ddx' : K x M, dout : M x N - if (dy) { - dy->set_lod(y.lod()); - // allocate and reshape dy - dev_ctx.template Alloc(dy); - DenseTensor dy_mat = dy->dims().size() > 2 - ? phi::ReshapeToMatrix(*dy, y_num_col_dims) - : *dy; - blas.MatMul(ddx_mat, true, dout_mat, false, &dy_mat); - } - // ddout1 = ddx * y. ddx : M x K, y : K x N, ddout1 : M x N - if (ddout) { - blas.MatMul(ddx_mat, - false, - y_mat, - false, - static_cast(1.0), - &ddout_mat, - static_cast(ddout_flag)); - ddout_flag = true; - } - } - if (ddy) { - auto ddy_mat = ddy->dims().size() > 2 - ? phi::ReshapeToMatrix(*ddy, y_num_col_dims) - : static_cast(*ddy); - // dx = dout * ddy'. dout : M x N, ddy' : N x K, dx : M x K - if (dx) { - dx->set_lod(x.lod()); - // allocate and reshape dx - dev_ctx.template Alloc(dx); - DenseTensor dx_mat = dx->dims().size() > 2 - ? phi::ReshapeToMatrix(*dx, x_num_col_dims) - : *dx; - blas.MatMul(dout_mat, false, ddy_mat, true, &dx_mat); - } - // ddout2 = x * ddy. x : M x K, ddy : K x N, ddout2 : M x N - if (ddout) { - blas.MatMul(x_mat, - false, - ddy_mat, - false, - static_cast(1.0), - &ddout_mat, - static_cast(ddout_flag)); - } - } -} -template -void LegacyMatmulGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - const DenseTensor& out_grad, - bool transpose_x, - bool transpose_y, - float alpha, - DenseTensor* dx, - DenseTensor* dy) { - MatmulGradKernel( - dev_ctx, x, y, out_grad, transpose_x, transpose_y, dx, dy); - if (std::fabs(alpha - 1.f) > 1e-6f) { - ScaleKernel(dev_ctx, *dx, Scalar(alpha), Scalar(0), false, dx); - ScaleKernel(dev_ctx, *dy, Scalar(alpha), Scalar(0), false, dy); - } -} -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h deleted file mode 100755 index 5221bd93ba9..00000000000 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h +++ /dev/null @@ -1,1717 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -// clang-format off -#include "glog/logging.h" - -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/autotune/cache_base.h" -#include "paddle/phi/kernels/cast_kernel.h" -#include "../funcs/blas/blas.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.hip.h" -#else -#include "../funcs/blas/blaslt_impl.cu.h" -#endif -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/scale_kernel.h" -#if defined(PADDLE_WITH_CUDA) -// #include "paddle/phi/kernels/funcs/cublaslt.h" -#include "paddle/phi/kernels/gpu/cuda_gemm_kernel.h" -#include "paddle/phi/kernels/transpose_kernel.h" -#elif defined(PADDLE_WITH_HIP) -#include "paddle/phi/kernels/funcs/hipblaslt.h" -#endif -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 -#include "paddle/phi/kernels/autotune/auto_tune_base.h" -#endif -#include "paddle/phi/kernels/full_kernel.h" -// clang-format on -namespace phi { - -static void GetBroadcastFromDims(const int x_ndim, - const std::int64_t* x_dims, - const int y_ndim, - const std::int64_t* y_dims, - std::int64_t* x_bd_dims, - std::int64_t* y_bd_dims, - std::int64_t* out_bd_dims) { - const int ndim = (std::max)(x_ndim, y_ndim); - std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1); - std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1); - std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim); - std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim); - - for (int i = 0; i < ndim; ++i) { - PADDLE_ENFORCE_EQ( - x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1, - true, - phi::errors::InvalidArgument( - "Input(X) and Input(Y) has error dim. " - "X_broadcast's shape[%s] must be equal to Y_broadcast's shape[%s], " - "or X_broadcast's shape[%s] <= 1, or Y_broadcast's shape[%s] <= 1, " - "but received X_broadcast's shape[%s] = [%s]" - "received Y_broadcast's shape[%s] = [%s].", - i, - i, - i, - i, - i, - x_bd_dims[i], - i, - y_bd_dims[i])); - if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) { - out_bd_dims[i] = 0; - } else { - out_bd_dims[i] = (std::max)(x_bd_dims[i], y_bd_dims[i]); - } - } -} - -static int64_t GetIndexMessage(const int n, - const int64_t* dims, - const int64_t* index) { - int64_t sum = 0; - for (int i = 0; i < n; ++i) { - if (dims[i] > 1) { - sum = sum * dims[i] + index[i]; - } - } - return sum; -} - -static void IndexIncreaseFromDims(const int ndim, - const int64_t* dims, - int64_t* index) { - for (int i = ndim - 1; i >= 0; --i) { - ++index[i]; - if (index[i] >= dims[i]) { - index[i] -= dims[i]; - } else { - break; - } - } -} - -// The general implementation with blas. 
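The helpers above implement NumPy-style broadcasting over the leading (batch) dimensions only; the trailing two dimensions are always left for the GEMM that the general BLAS implementation below performs. A minimal standalone sketch of the broadcast rule, using hypothetical names that are not part of this header:

#include <algorithm>
#include <cstdint>
#include <vector>

// Pad the shorter batch shape with leading 1s, then take the element-wise
// max, treating any 0 as an empty batch. This mirrors the rule that
// GetBroadcastFromDims applies after its enforce check on incompatible dims.
static std::vector<int64_t> BroadcastBatchDims(std::vector<int64_t> a,
                                               std::vector<int64_t> b) {
  const size_t n = std::max(a.size(), b.size());
  a.insert(a.begin(), n - a.size(), 1);
  b.insert(b.begin(), n - b.size(), 1);
  std::vector<int64_t> out(n);
  for (size_t i = 0; i < n; ++i) {
    out[i] = (a[i] == 0 || b[i] == 0) ? 0 : std::max(a[i], b[i]);
  }
  return out;
}

// Example: batch dims {2, 1, 3} and {5, 1} broadcast to {2, 5, 3};
// GetIndexMessage then maps an output batch index back to the offset of the
// matching slice in each (possibly smaller) operand, and
// IndexIncreaseFromDims advances that index like an odometer.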
-template -void MatMulFunctionImplWithBlas( - const Context& dev_ctx, - const DenseTensor& X, - const DenseTensor& Y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* Out, - bool trans_x, - bool trans_y, - bool flag = false, - phi::funcs::MatmulPlanner* matmul_planner UNUSED = nullptr) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - - // Get data ptr - const T* x_data = X.data(); - const T* y_data = Y.data(); - - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (x_ndim == 1 && y_ndim == 1) { - const int M = X.numel(); - const int N = Y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers, " - "when X/Y's dims =1. But received X has [%d] elements, " - "received Y has [%d] elements.", - M, - N)); - VLOG(3) << "MatMul's case 1"; - Out->Resize(common::make_ddim({})); - dev_ctx.template Alloc(Out); - blas.GEMM(CblasNoTrans, - CblasTrans, - 1, - 1, - M, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - return; - } - - if (x_ndim == 1) { - const int N = X.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - if (trans_y) { - const int M = Y.numel() / N; - VLOG(3) << "MatMul's case 2"; - blas.GEMV(false, - M, - N, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 3"; - blas.GEMV(true, - N, - M, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 4"; - blas.BatchedGEMM(CblasTrans, - CblasNoTrans, - M, - 1, - N, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - batch_size, - M * N, - 0); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." 
- "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 5"; - blas.GEMV(true, - N, - M, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 6"; - blas.BatchedGEMM(CblasTrans, - CblasNoTrans, - M, - 1, - N, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - batch_size, - M * N, - 0); - } - } else { - const int M = X.numel() / N; - VLOG(3) << "MatMul's case 7"; - blas.GEMV(false, - M, - N, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } - return; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - dev_ctx.template Alloc(Out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul's case 8"; - blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? 
CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul's case 9"; - blas.GEMV(false, - y_batch_size * N, - K, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 10"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - 0, - K * N); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul's case 11"; - blas.GEMM(CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - x_batch_size * M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 12"; - blas.BatchedGEMM(CblasTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - M * K, - 0); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul's case 13"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - M * K, - K * N); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul's case 14"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_ptr.data(), - y_ptr.data(), - static_cast(flag), - out_ptr.data(), - out_batch_size); - } -} - -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 -// This is almost a copy from MatMulFunctionImplWithBlas, -// compare cublas with cublasLt kernels when Matmul autotune is on -template -void MatMulFunctionImplWithCublasLt( - const Context& dev_ctx, - const DenseTensor& X, - const DenseTensor& Y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* Out, - bool trans_x, - bool trans_y, - bool flag = false, - phi::funcs::MatmulPlanner* matmul_planner = nullptr) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - const T* x_data = X.data(); - const T* y_data = Y.data(); - using blaslt = phi::funcs::MatmulWithCublasLt; - - if (x_ndim == 1 && y_ndim == 1) { - const int M = X.numel(); - const int N = Y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. 
But received X has [%d] elements," - "received Y has [%d] elements", - M, - N)); - - // MatMul's case 0 => vector * vector - Out->Resize(common::make_ddim({})); - dev_ctx.template Alloc(Out); - VLOG(3) << "MatMul with blaslt case 1"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - 1, - 1, - M, - false, - true, - matmul_planner); - return; - } - - if (x_ndim == 1) { - const int N = X.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - if (trans_y) { - const int M = Y.numel() / N; - VLOG(3) << "MatMul with blaslt 2"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - false, - false, - matmul_planner); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul with blaslt 3"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 4"; - blaslt::RunWithBatch(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - matmul_planner); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." 
- "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul with blaslt 5"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 6"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - matmul_planner); - } - } else { - const int M = X.numel() / N; - VLOG(3) << "MatMul with blaslt 7"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - false, - false, - matmul_planner); - } - return; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - dev_ctx.template Alloc(Out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul with blaslt 8"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - matmul_planner); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul with blaslt 9"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - y_batch_size * N, - 1, - K, - false, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 10"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - 0, - K * N, - M * N, - matmul_planner); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul with blaslt 11"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - x_batch_size * M, - N, - K, - false, - trans_y, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 12"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - true, - trans_y, - out_batch_size, - M * K, - 0, - M * N, - matmul_planner); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul with blaslt 13"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - M * K, - K * N, - M * N, - matmul_planner); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul with blaslt 14"; - blaslt::RunWithBatch(dev_ctx, - x_ptr.data(), - y_ptr.data(), - out_ptr.data(), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - matmul_planner); - } 
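  // Summary of the cuBLASLt dispatch above:
  //   * both batch sizes are 1              -> one Run over a plain M x K by K x N GEMM;
  //   * x not batched, M == 1, trans_y      -> the batched y collapses into a single
  //     (y_batch_size * N) x K operand, so one Run suffices;
  //   * x not batched otherwise             -> RunWithBatch with strides 0 / K*N / M*N;
  //   * y not batched, !trans_x             -> the x batch folds into the row count
  //     (x_batch_size * M), so one Run suffices;
  //   * y not batched, trans_x              -> RunWithBatch with strides M*K / 0 / M*N;
  //   * equal batch shapes (no broadcast)   -> RunWithBatch with strides M*K / K*N / M*N;
  //   * genuine broadcasting                -> the pointer-array form above, since no
  //     single stride can describe slices that repeat on only one side.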
-} -#endif - -template -struct MatMulDispatcher { - void operator()(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { - MatMulFunctionImplWithBlas( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); - } -}; - -#ifdef PADDLE_WITH_CUDA -template -struct MatMulDispatcher { - void operator()(const phi::GPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { -#if CUDA_VERSION >= 11060 && 0 - auto* tuner = phi::autotune::MakeMatmulTuner( - MatMulFunctionImplWithBlas); - tuner->AddCallBack(MatMulFunctionImplWithCublasLt); - phi::funcs::MatmulPlanner matmul_planner(x_dims, - y_dims, - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ flag, - /* no_exchange */ true); - tuner->Run(ctx, - matmul_planner.GetKey(), - ctx, - x, - y, - x_dims, - y_dims, - out, - trans_x, - trans_y, - flag, - &matmul_planner); -#else - MatMulFunctionImplWithBlas( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); -#endif - } -}; - -#endif // PADDLE_WITH_CUDA - -template -void MatMulFunction(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { - MatMulDispatcher()( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); -} - -template -bool MatMulInt8Function(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y) { - return false; -} - -#ifdef PADDLE_WITH_CUDA -template <> -bool inline MatMulInt8Function(const phi::GPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y) { - if (x.dtype() != DataType::INT8 || y.dtype() != DataType::INT8) { - return false; - } -#if CUDA_VERSION >= 11060 && 0 - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - const int8_t* x_data = x.data(); - const int8_t* y_data = y.data(); - using blaslt = phi::funcs::MatmulWithCublasLt; - - phi::funcs::MatmulPlanner matmul_planner( - x_dims, - y_dims, - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ false, - /* no_exchange */ true); - - if (x_ndim == 1 && y_ndim == 1) { - const int M = x.numel(); - const int N = y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - M, - N)); - if (!(M % 4 == 0)) { - return false; - } - - out->Resize(common::make_ddim({})); - ctx.template Alloc(out); - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - 1, - 1, - M, - false, - true, - &matmul_planner); - return true; - } - if (x_ndim == 1) { - const int N = x.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - if (!(N % 4 == 0)) { - return false; - } - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - const int M = y.numel() / N; - if (!(M == 1 || M % 4 == 0)) { - return false; - } - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - out->ResizeAndAllocate(common::make_ddim(out_dims)); - ctx.template Alloc(out); - if (trans_y) { - const int M = y.numel() / N; - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - false, - false, - &matmul_planner); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = y.numel() / (M * N); - if (batch_size == 1) { - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - &matmul_planner); - } - } - return true; - } - - if (y_ndim == 1) { - const int N = y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - const int M = x.numel() / N; - if (!((M == 1 || M % 4 == 0))) { - return false; - } - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - if (N % 4 != 0) { - return false; - } - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - out->ResizeAndAllocate(common::make_ddim(out_dims)); - ctx.template Alloc(out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = x.numel() / (M * N); - if (batch_size == 1) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - &matmul_planner); - } - } else { - const int M = x.numel() / N; - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - false, - false, - &matmul_planner); - } - return true; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - ctx.template Alloc(out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return true; - - if (x_batch_size == 1 && M == 1 && trans_y) { - if (!(K % 4 == 0)) { - return false; - } - } else if (!trans_x && !trans_y) { - if (!(N % 4 == 0 || N == 1) || !(K % 4 == 0) || (M == 1 && N == 1)) { - return false; - } - } else if (!trans_x && trans_y) { - if (!(K % 4 == 0)) { - return false; - } - } else if (trans_x && !trans_y) { - if (!(M % 4 == 0 || M == 1) || !(N % 4 == 0 || N == 1)) { - return false; - } - } else { - if (!(M % 4 == 0 || M == 1) || !(K % 4 == 0)) { - return false; - } - } - if (x_batch_size == 1 && y_batch_size == 1) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - &matmul_planner); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - y_batch_size * N, - 1, - K, - false, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - 0, - K * N, - M * N, - &matmul_planner); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - x_batch_size * M, - N, - K, - false, - trans_y, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - true, - trans_y, - out_batch_size, - M * K, - 0, - M * N, - &matmul_planner); - } - } else if (!is_broadcast_dims) { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - M * K, - K * N, - M * N, - &matmul_planner); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data 
+ x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = ctx.template Alloc(out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - blaslt::RunWithBatch(ctx, - x_ptr.data(), - y_ptr.data(), - out_ptr.data(), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - &matmul_planner); - } - return true; -#else - return false; -#endif -} -#endif - -template -typename std::enable_if::value>::type -MatmulJudgeDtypeKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool transpose_x, - bool transpose_y) { - bool try_matmul_int8 = MatMulInt8Function( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); - if (try_matmul_int8) { - return; - } - auto x_tmp = phi::Cast(ctx, x, phi::DataType::FLOAT32); - auto y_tmp = phi::Cast(ctx, y, phi::DataType::FLOAT32); - DenseTensor out_tmp; - MatMulFunction( - ctx, x_tmp, y_tmp, x_dims, y_dims, &out_tmp, transpose_x, transpose_y); - if (x.dtype() == phi::DataType::INT8) { - phi::CastKernel(ctx, out_tmp, phi::DataType::INT32, out); - return; - } - phi::CastKernel(ctx, out_tmp, x.dtype(), out); -} - -template -typename std::enable_if::value>::type -MatmulJudgeDtypeKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool transpose_x, - bool transpose_y) { - MatMulFunction( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out) { - if (x.numel() == 0 || y.numel() == 0) { - // input shape [1, 1, 5, 0], [1, 1, 0, 5], result shape is [1, 1, 5, 5] - phi::Full( - ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); - return; - } - PADDLE_ENFORCE_GE( - common::product(x.dims()), - 0, - common::errors::InvalidArgument( - "The dims of Input(X) should be greater than or equal to 0.")); - PADDLE_ENFORCE_GE( - common::product(y.dims()), - 0, - common::errors::InvalidArgument( - "The dims of Input(Y) should be greater than or equal to 0.")); - const std::vector x_dims = common::vectorize(x.dims()); - const std::vector y_dims = common::vectorize(y.dims()); - MatmulJudgeDtypeKernel( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulWithFlattenKernelImpl(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - const DenseTensor x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const DenseTensor y_matrix = - y.dims().size() > 2 ? 
phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - dev_ctx.template Alloc(out); - auto z_dim = out->dims(); - if (z_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.MatMul(x_matrix, y_matrix, out); - if (z_dim.size() != 2) { - out->Resize(z_dim); - } -} - -#ifdef PADDLE_WITH_CUDA - -template -void MatmulWithFlattenKernelInt8Impl(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - PADDLE_ENFORCE_EQ( - x.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(x) used in int8 mul must be (%s) " - "does not match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - x.dtype())); - PADDLE_ENFORCE_EQ( - y.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(y) used in int8 mul must be (%s) " - "does not match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - y.dtype())); - - const DenseTensor x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const DenseTensor y_matrix = - y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - PADDLE_ENFORCE_EQ( - x_matrix.dims()[1], - y_matrix.dims()[0], - phi::errors::InvalidArgument( - "X's numbers of columns must be equal to Y's numbers of rows." - "But received X has [%d] columns," - "received Y has [%d] rows", - x_matrix.dims()[1], - y_matrix.dims()[0])); - - PADDLE_ENFORCE_EQ((y_matrix.dims()[1] % 4 == 0 || y_matrix.dims()[1] == 1), - true, - phi::errors::InvalidArgument( - "The dimension size N used in int8 mul must be 1" - "or a multiple of 4 does not match the size (%d)" - "currently contained in the container.", - y_matrix.dims()[1])); - PADDLE_ENFORCE_EQ((x_matrix.dims()[1] % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 mul must be a" - "multiple of 4 does not match the size (%d) currently" - "contained in the container.", - x_matrix.dims()[1])); - - dev_ctx.template Alloc(out); - auto z_dim = out->dims(); - if (z_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - -#if CUDA_VERSION >= 11060 && 0 - using blaslt = phi::funcs::MatmulWithCublasLt; - - const int8_t* x_data = x_matrix.data(); - const int8_t* y_data = y_matrix.data(); - - std::vector x_dims = {x_matrix.dims()[0], x_matrix.dims()[1]}; - std::vector y_dims = {y_matrix.dims()[0], y_matrix.dims()[1]}; - phi::funcs::MatmulPlanner matmul_planner( - x_dims, - y_dims, - false, - false, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ false, - /* no_exchange */ true); - - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(out), - x_matrix.dims()[0], - y_matrix.dims()[1], - x_matrix.dims()[1], - false, - false, - &matmul_planner); - - if (z_dim.size() != 2) { - out->Resize(z_dim); - } -#endif -} -#endif - -#ifdef PADDLE_WITH_CUDA -template -typename std::enable_if::value, - void>::type -DispatchMatmulWithFlattenInt8Kernel(const phi::GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - MatmulWithFlattenKernelInt8Impl( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} -#endif - -template -typename std::enable_if::value, - void>::type 
-DispatchMatmulWithFlattenInt8Kernel(const phi::CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - PADDLE_THROW(phi::errors::Unimplemented( - "MatmulWithFlatten with CPU is NOT implemented " - "yet.")); -} - -template -typename std::enable_if::value, void>::type -DispatchMatmulFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - DispatchMatmulWithFlattenInt8Kernel( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -typename std::enable_if::value, void>::type -DispatchMatmulFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - MatmulWithFlattenKernelImpl( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -void MatmulWithFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - DispatchMatmulFlattenKernel( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -void LegacyMatmulKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - float alpha, - DenseTensor* out) { - MatmulKernel(ctx, x, y, transpose_x, transpose_y, out); - if (std::fabs(alpha - 1.f) > 1e-6f) { - ScaleKernel(ctx, *out, Scalar(alpha), Scalar(0), false, out); - } -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl_maca.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl_maca.h deleted file mode 100644 index 9750abae5ca..00000000000 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl_maca.h +++ /dev/null @@ -1,1696 +0,0 @@ -// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights -// Reserved. -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -// clang-format off -#include "glog/logging.h" - -#include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/autotune/cache_base.h" -#include "paddle/phi/kernels/cast_kernel.h" -#include "../funcs/blas/blas.h" -#ifdef PADDLE_WITH_HIP -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.hip.h" -#else -#include "../funcs/blas/blaslt_impl.cu.h" -#endif -#include "paddle/phi/kernels/funcs/complex_functors.h" -#include "paddle/phi/kernels/scale_kernel.h" -#if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/kernels/funcs/cublaslt.h" -#include "paddle/phi/kernels/gpu/cuda_gemm_kernel.h" -#include "paddle/phi/kernels/transpose_kernel.h" -#elif defined(PADDLE_WITH_HIP) -#include "paddle/phi/kernels/funcs/hipblaslt.h" -#endif -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 -#include "paddle/phi/kernels/autotune/auto_tune_base.h" -#endif -// clang-format on -namespace phi { - -static void GetBroadcastFromDims(const int x_ndim, - const std::int64_t* x_dims, - const int y_ndim, - const std::int64_t* y_dims, - std::int64_t* x_bd_dims, - std::int64_t* y_bd_dims, - std::int64_t* out_bd_dims) { - const int ndim = (std::max)(x_ndim, y_ndim); - std::fill(x_bd_dims, x_bd_dims + ndim - x_ndim, 1); - std::fill(y_bd_dims, y_bd_dims + ndim - y_ndim, 1); - std::copy(x_dims, x_dims + x_ndim, x_bd_dims + ndim - x_ndim); - std::copy(y_dims, y_dims + y_ndim, y_bd_dims + ndim - y_ndim); - - for (int i = 0; i < ndim; ++i) { - PADDLE_ENFORCE_EQ( - x_bd_dims[i] == y_bd_dims[i] || x_bd_dims[i] <= 1 || y_bd_dims[i] <= 1, - true, - phi::errors::InvalidArgument( - "Input(X) and Input(Y) has error dim. " - "X_broadcast's shape[%s] must be equal to Y_broadcast's shape[%s], " - "or X_broadcast's shape[%s] <= 1, or Y_broadcast's shape[%s] <= 1, " - "but received X_broadcast's shape[%s] = [%s]" - "received Y_broadcast's shape[%s] = [%s].", - i, - i, - i, - i, - i, - x_bd_dims[i], - i, - y_bd_dims[i])); - if (x_bd_dims[i] == 0 || y_bd_dims[i] == 0) { - out_bd_dims[i] = 0; - } else { - out_bd_dims[i] = (std::max)(x_bd_dims[i], y_bd_dims[i]); - } - } -} - -static int64_t GetIndexMessage(const int n, - const int64_t* dims, - const int64_t* index) { - int64_t sum = 0; - for (int i = 0; i < n; ++i) { - if (dims[i] > 1) { - sum = sum * dims[i] + index[i]; - } - } - return sum; -} - -static void IndexIncreaseFromDims(const int ndim, - const int64_t* dims, - int64_t* index) { - for (int i = ndim - 1; i >= 0; --i) { - ++index[i]; - if (index[i] >= dims[i]) { - index[i] -= dims[i]; - } else { - break; - } - } -} - -// The general implementation with blas. -template -void MatMulFunctionImplWithBlas( - const Context& dev_ctx, - const DenseTensor& X, - const DenseTensor& Y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* Out, - bool trans_x, - bool trans_y, - bool flag = false, - phi::funcs::MatmulPlanner* matmul_planner UNUSED = nullptr) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - - // Get data ptr - const T* x_data = X.data(); - const T* y_data = Y.data(); - - auto blas = phi::funcs::GetBlas(dev_ctx); - - if (x_ndim == 1 && y_ndim == 1) { - const int M = X.numel(); - const int N = Y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers, " - "when X/Y's dims =1. 
But received X has [%d] elements, " - "received Y has [%d] elements.", - M, - N)); - VLOG(3) << "MatMul's case 1"; - Out->Resize(common::make_ddim({})); - dev_ctx.template Alloc(Out); - blas.GEMM(CblasNoTrans, - CblasTrans, - 1, - 1, - M, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - return; - } - - if (x_ndim == 1) { - const int N = X.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - if (trans_y) { - const int M = Y.numel() / N; - VLOG(3) << "MatMul's case 2"; - blas.GEMV(false, - M, - N, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 3"; - blas.GEMV(true, - N, - M, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 4"; - blas.BatchedGEMM(CblasTrans, - CblasNoTrans, - M, - 1, - N, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - batch_size, - M * N, - 0); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul's case 5"; - blas.GEMV(true, - N, - M, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 6"; - blas.BatchedGEMM(CblasTrans, - CblasNoTrans, - M, - 1, - N, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - batch_size, - M * N, - 0); - } - } else { - const int M = X.numel() / N; - VLOG(3) << "MatMul's case 7"; - blas.GEMV(false, - M, - N, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } - return; - } - - const int M = trans_x ? 
x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - dev_ctx.template Alloc(Out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul's case 8"; - blas.GEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul's case 9"; - blas.GEMV(false, - y_batch_size * N, - K, - static_cast(1), - y_data, - x_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 10"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - 0, - K * N); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul's case 11"; - blas.GEMM(CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - x_batch_size * M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out)); - } else { - VLOG(3) << "MatMul's case 12"; - blas.BatchedGEMM(CblasTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - M * K, - 0); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul's case 13"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? 
CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_data, - y_data, - static_cast(flag), - dev_ctx.template Alloc(Out), - out_batch_size, - M * K, - K * N); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul's case 14"; - blas.BatchedGEMM(trans_x ? CblasTrans : CblasNoTrans, - trans_y ? CblasTrans : CblasNoTrans, - M, - N, - K, - static_cast(1), - x_ptr.data(), - y_ptr.data(), - static_cast(flag), - out_ptr.data(), - out_batch_size); - } -} - -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 -// This is almost a copy from MatMulFunctionImplWithBlas, -// compare cublas with cublasLt kernels when Matmul autotune is on -template -void MatMulFunctionImplWithCublasLt( - const Context& dev_ctx, - const DenseTensor& X, - const DenseTensor& Y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* Out, - bool trans_x, - bool trans_y, - bool flag = false, - phi::funcs::MatmulPlanner* matmul_planner = nullptr) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - const T* x_data = X.data(); - const T* y_data = Y.data(); - using blaslt = phi::funcs::MatmulWithCublasLt; - - if (x_ndim == 1 && y_ndim == 1) { - const int M = X.numel(); - const int N = Y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - M, - N)); - - // MatMul's case 0 => vector * vector - Out->Resize(common::make_ddim({})); - dev_ctx.template Alloc(Out); - VLOG(3) << "MatMul with blaslt case 1"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - 1, - 1, - M, - false, - true, - matmul_planner); - return; - } - - if (x_ndim == 1) { - const int N = X.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - if (trans_y) { - const int M = Y.numel() / N; - VLOG(3) << "MatMul with blaslt 2"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - false, - false, - matmul_planner); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul with blaslt 3"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 4"; - blaslt::RunWithBatch(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - matmul_planner); - } - } - return; - } - - if (y_ndim == 1) { - const int N = Y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->ResizeAndAllocate(common::make_ddim(out_dims)); - dev_ctx.template Alloc(Out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X.numel() / (M * N); - if (batch_size == 1) { - VLOG(3) << "MatMul with blaslt 5"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 6"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - matmul_planner); - } - } else { - const int M = X.numel() / N; - VLOG(3) << "MatMul with blaslt 7"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - 1, - N, - false, - false, - matmul_planner); - } - return; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - dev_ctx.template Alloc(Out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return; - if (x_batch_size == 1 && y_batch_size == 1) { - VLOG(3) << "MatMul with blaslt 8"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - matmul_planner); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - VLOG(3) << "MatMul with blaslt 9"; - blaslt::Run(dev_ctx, - y_data, - x_data, - dev_ctx.template Alloc(Out), - y_batch_size * N, - 1, - K, - false, - false, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 10"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - 0, - K * N, - M * N, - matmul_planner); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - VLOG(3) << "MatMul with blaslt 11"; - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - x_batch_size * M, - N, - K, - false, - trans_y, - matmul_planner); - } else { - VLOG(3) << "MatMul with blaslt 12"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - true, - trans_y, - out_batch_size, - M * K, - 0, - M * N, - matmul_planner); - } - } else if (!is_broadcast_dims) { - VLOG(3) << "MatMul with blaslt 13"; - blaslt::RunWithBatch(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(Out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - M * K, - K * N, - M * N, - matmul_planner); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data + x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = dev_ctx.template Alloc(Out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - VLOG(3) << "MatMul with blaslt 14"; - blaslt::RunWithBatch(dev_ctx, - x_ptr.data(), - y_ptr.data(), - out_ptr.data(), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - matmul_planner); - } 
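  // When the batch shapes only match after broadcasting, strided batched GEMM
  // cannot be used because the repeated operand has no uniform stride, so the
  // loop above builds one pointer per output batch slot instead.
  // Worked example: x batch dims {2, 1} and y batch dims {1, 3} broadcast to an
  // out batch of {2, 3}. For out slot (i, j), GetIndexMessage skips dims equal
  // to 1, so x uses offset i * M * K and y uses offset j * K * N.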
-} -#endif - -template -struct MatMulDispatcher { - void operator()(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { - MatMulFunctionImplWithBlas( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); - } -}; - -#ifdef PADDLE_WITH_CUDA -template -struct MatMulDispatcher { - void operator()(const phi::GPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { -#if CUDA_VERSION >= 11060 && 0 - auto* tuner = phi::autotune::MakeMatmulTuner( - MatMulFunctionImplWithBlas); - tuner->AddCallBack(MatMulFunctionImplWithCublasLt); - phi::funcs::MatmulPlanner matmul_planner(x_dims, - y_dims, - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ flag, - /* no_exchange */ true); - tuner->Run(ctx, - matmul_planner.GetKey(), - ctx, - x, - y, - x_dims, - y_dims, - out, - trans_x, - trans_y, - flag, - &matmul_planner); -#else - MatMulFunctionImplWithBlas( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); -#endif - } -}; - -#endif // PADDLE_WITH_CUDA - -template -void MatMulFunction(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y, - bool flag = false) { - MatMulDispatcher()( - ctx, x, y, x_dims, y_dims, out, trans_x, trans_y, flag); -} - -template -bool MatMulInt8Function(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y) { - return false; -} - -#ifdef PADDLE_WITH_CUDA -template <> -bool inline MatMulInt8Function(const phi::GPUContext& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool trans_x, - bool trans_y) { - if (x.dtype() != DataType::INT8 || y.dtype() != DataType::INT8) { - return false; - } -#if CUDA_VERSION >= 11060 && 0 - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - const int8_t* x_data = x.data(); - const int8_t* y_data = y.data(); - using blaslt = phi::funcs::MatmulWithCublasLt; - - phi::funcs::MatmulPlanner matmul_planner( - x_dims, - y_dims, - trans_x, - trans_y, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ false, - /* no_exchange */ true); - - if (x_ndim == 1 && y_ndim == 1) { - const int M = x.numel(); - const int N = y.numel(); - PADDLE_ENFORCE_EQ( - M, - N, - phi::errors::InvalidArgument( - "X's numbers must be equal to Y's numbers," - "when X/Y's dims =1. But received X has [%d] elements," - "received Y has [%d] elements", - M, - N)); - if (!(M % 4 == 0)) { - return false; - } - - out->Resize(common::make_ddim({})); - ctx.template Alloc(out); - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - 1, - 1, - M, - false, - true, - &matmul_planner); - return true; - } - if (x_ndim == 1) { - const int N = x.numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - N, - y_ndim - 1, - y_dims[y_ndim - 1])); - if (!(N % 4 == 0)) { - return false; - } - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - N, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - N, - y_ndim - 2, - y_dims[y_ndim - 2])); - const int M = y.numel() / N; - if (!(M == 1 || M % 4 == 0)) { - return false; - } - } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - out->ResizeAndAllocate(common::make_ddim(out_dims)); - ctx.template Alloc(out); - if (trans_y) { - const int M = y.numel() / N; - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - false, - false, - &matmul_planner); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = y.numel() / (M * N); - if (batch_size == 1) { - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - y_data, - x_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - &matmul_planner); - } - } - return true; - } - - if (y_ndim == 1) { - const int N = y.numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 2, - N, - x_ndim - 2, - x_dims[x_ndim - 2])); - const int M = x.numel() / N; - if (!((M == 1 || M % 4 == 0))) { - return false; - } - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], - N, - phi::errors::InvalidArgument("Input(X) has error dim." - "X'dims[%d] must be equal to %d" - "But received X'dims[%d] is %d", - x_ndim - 1, - N, - x_ndim - 1, - x_dims[x_ndim - 1])); - if (N % 4 != 0) { - return false; - } - } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - out->ResizeAndAllocate(common::make_ddim(out_dims)); - ctx.template Alloc(out); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = x.numel() / (M * N); - if (batch_size == 1) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - true, - false, - batch_size, - M * N, - 0, - M, - &matmul_planner); - } - } else { - const int M = x.numel() / N; - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - 1, - N, - false, - false, - &matmul_planner); - } - return true; - } - - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. " - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 1, - K, - y_ndim - 1, - y_dims[y_ndim - 1])); - } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], - K, - phi::errors::InvalidArgument("Input(Y) has error dim. 
" - "Y'dims[%d] must be equal to %d, " - "but received Y'dims[%d] is %d.", - y_ndim - 2, - K, - y_ndim - 2, - y_dims[y_ndim - 2])); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector x_broadcast_dims(ndim); - std::vector y_broadcast_dims(ndim); - std::vector out_broadcast_dims(ndim); - GetBroadcastFromDims(x_ndim - 2, - x_dims.data(), - y_ndim - 2, - y_dims.data(), - x_broadcast_dims.data(), - y_broadcast_dims.data(), - out_broadcast_dims.data()); - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - out->ResizeAndAllocate(common::make_ddim(out_broadcast_dims)); - ctx.template Alloc(out); - - const int batch_dim = ndim - 2; - // broadcast message - const bool is_broadcast_dims = - !std::equal(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - y_broadcast_dims.cbegin()); - - const std::int64_t x_batch_size = - std::accumulate(x_broadcast_dims.cbegin(), - x_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t y_batch_size = - std::accumulate(y_broadcast_dims.cbegin(), - y_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - const std::int64_t out_batch_size = - std::accumulate(out_broadcast_dims.cbegin(), - out_broadcast_dims.cbegin() + batch_dim, - 1LL, - std::multiplies()); - if (out_batch_size == 0) return true; - - if (x_batch_size == 1 && M == 1 && trans_y) { - if (!(K % 4 == 0)) { - return false; - } - } else if (!trans_x && !trans_y) { - if (!(N % 4 == 0 || N == 1) || !(K % 4 == 0) || (M == 1 && N == 1)) { - return false; - } - } else if (!trans_x && trans_y) { - if (!(K % 4 == 0)) { - return false; - } - } else if (trans_x && !trans_y) { - if (!(M % 4 == 0 || M == 1) || !(N % 4 == 0 || N == 1)) { - return false; - } - } else { - if (!(M % 4 == 0 || M == 1) || !(K % 4 == 0)) { - return false; - } - } - if (x_batch_size == 1 && y_batch_size == 1) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - &matmul_planner); - } else if (x_batch_size == 1) { - if (M == 1 && trans_y) { - blaslt::Run(ctx, - y_data, - x_data, - ctx.template Alloc(out), - y_batch_size * N, - 1, - K, - false, - false, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - 0, - K * N, - M * N, - &matmul_planner); - } - } else if (y_batch_size == 1) { - if (!trans_x) { - blaslt::Run(ctx, - x_data, - y_data, - ctx.template Alloc(out), - x_batch_size * M, - N, - K, - false, - trans_y, - &matmul_planner); - } else { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - true, - trans_y, - out_batch_size, - M * K, - 0, - M * N, - &matmul_planner); - } - } else if (!is_broadcast_dims) { - blaslt::RunWithBatch(ctx, - x_data, - y_data, - ctx.template Alloc(out), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - M * K, - K * N, - M * N, - &matmul_planner); - } else { - // in the case, can't use stridedgemm - std::vector x_ptr(out_batch_size); - std::vector y_ptr(out_batch_size); - std::vector out_ptr(out_batch_size); - std::vector index(batch_dim, 0); - for (std::int64_t i = 0; i < out_batch_size; ++i) { - // using the index to get offset - const std::int64_t x_index = - GetIndexMessage(batch_dim, x_broadcast_dims.data(), index.data()); - const std::int64_t y_index = - GetIndexMessage(batch_dim, y_broadcast_dims.data(), index.data()); - - x_ptr[i] = x_data 
+ x_index * M * K; - y_ptr[i] = y_data + y_index * K * N; - out_ptr[i] = ctx.template Alloc(out) + i * M * N; - IndexIncreaseFromDims(batch_dim, out_broadcast_dims.data(), index.data()); - } - blaslt::RunWithBatch(ctx, - x_ptr.data(), - y_ptr.data(), - out_ptr.data(), - M, - N, - K, - trans_x, - trans_y, - out_batch_size, - &matmul_planner); - } - return true; -#else - return false; -#endif -} -#endif - -template -typename std::enable_if::value>::type -MatmulJudgeDtypeKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool transpose_x, - bool transpose_y) { - bool try_matmul_int8 = MatMulInt8Function( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); - if (try_matmul_int8) { - return; - } - auto x_tmp = phi::Cast(ctx, x, phi::DataType::FLOAT32); - auto y_tmp = phi::Cast(ctx, y, phi::DataType::FLOAT32); - DenseTensor out_tmp; - MatMulFunction( - ctx, x_tmp, y_tmp, x_dims, y_dims, &out_tmp, transpose_x, transpose_y); - if (x.dtype() == phi::DataType::INT8) { - phi::CastKernel(ctx, out_tmp, phi::DataType::INT32, out); - return; - } - phi::CastKernel(ctx, out_tmp, x.dtype(), out); -} - -template -typename std::enable_if::value>::type -MatmulJudgeDtypeKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - const std::vector& x_dims, - const std::vector& y_dims, - DenseTensor* out, - bool transpose_x, - bool transpose_y) { - MatMulFunction( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulKernel(const Context& ctx, - const DenseTensor& x, - const DenseTensor& y, - bool transpose_x, - bool transpose_y, - DenseTensor* out) { - PADDLE_ENFORCE_NE( - common::product(x.dims()), - 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( - common::product(y.dims()), - 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. ")); - const std::vector x_dims = common::vectorize(x.dims()); - const std::vector y_dims = common::vectorize(y.dims()); - MatmulJudgeDtypeKernel( - ctx, x, y, x_dims, y_dims, out, transpose_x, transpose_y); -} - -template -void MatmulWithFlattenKernelImpl(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - const DenseTensor x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const DenseTensor y_matrix = - y.dims().size() > 2 ? 
phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - dev_ctx.template Alloc(out); - auto z_dim = out->dims(); - if (z_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.MatMul(x_matrix, y_matrix, out); - if (z_dim.size() != 2) { - out->Resize(z_dim); - } -} - -#ifdef PADDLE_WITH_CUDA - -template -void MatmulWithFlattenKernelInt8Impl(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - PADDLE_ENFORCE_EQ( - x.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(x) used in int8 mul must be (%s) " - "does not match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - x.dtype())); - PADDLE_ENFORCE_EQ( - y.dtype(), - DataType::INT8, - phi::errors::InvalidArgument( - "The type of input(y) used in int8 mul must be (%s) " - "does not match the " - "type of data (%s) currently contained in the container.", - phi::CppTypeToDataType::Type(), - y.dtype())); - - const DenseTensor x_matrix = - x.dims().size() > 2 ? phi::ReshapeToMatrix(x, x_num_col_dims) : x; - const DenseTensor y_matrix = - y.dims().size() > 2 ? phi::ReshapeToMatrix(y, y_num_col_dims) : y; - - PADDLE_ENFORCE_EQ( - x_matrix.dims()[1], - y_matrix.dims()[0], - phi::errors::InvalidArgument( - "X's numbers of columns must be equal to Y's numbers of rows." - "But received X has [%d] columns," - "received Y has [%d] rows", - x_matrix.dims()[1], - y_matrix.dims()[0])); - - PADDLE_ENFORCE_EQ((y_matrix.dims()[1] % 4 == 0 || y_matrix.dims()[1] == 1), - true, - phi::errors::InvalidArgument( - "The dimension size N used in int8 mul must be 1" - "or a multiple of 4 does not match the size (%d)" - "currently contained in the container.", - y_matrix.dims()[1])); - PADDLE_ENFORCE_EQ((x_matrix.dims()[1] % 4 == 0), - true, - phi::errors::InvalidArgument( - "The dimension size K used in int8 mul must be a" - "multiple of 4 does not match the size (%d) currently" - "contained in the container.", - x_matrix.dims()[1])); - - dev_ctx.template Alloc(out); - auto z_dim = out->dims(); - if (z_dim.size() != 2) { - out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); - } - -#if CUDA_VERSION >= 11060 && 0 - using blaslt = phi::funcs::MatmulWithCublasLt; - - const int8_t* x_data = x_matrix.data(); - const int8_t* y_data = y_matrix.data(); - - std::vector x_dims = {x_matrix.dims()[0], x_matrix.dims()[1]}; - std::vector y_dims = {y_matrix.dims()[0], y_matrix.dims()[1]}; - phi::funcs::MatmulPlanner matmul_planner( - x_dims, - y_dims, - false, - false, - phi::CppTypeToDataType::Type(), - funcs::MatmulFusedType::kMatmul, - /* bias_data */ nullptr, - /* reserve_data */ nullptr, - /* use_addto */ false, - /* no_exchange */ true); - - blaslt::Run(dev_ctx, - x_data, - y_data, - dev_ctx.template Alloc(out), - x_matrix.dims()[0], - y_matrix.dims()[1], - x_matrix.dims()[1], - false, - false, - &matmul_planner); - - if (z_dim.size() != 2) { - out->Resize(z_dim); - } -#endif -} -#endif - -#ifdef PADDLE_WITH_CUDA -template -typename std::enable_if::value, - void>::type -DispatchMatmulWithFlattenInt8Kernel(const phi::GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - MatmulWithFlattenKernelInt8Impl( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} -#endif - -template -typename std::enable_if::value, - void>::type 
-DispatchMatmulWithFlattenInt8Kernel(const phi::CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - PADDLE_THROW(phi::errors::Unimplemented( - "MatmulWithFlatten with CPU is NOT implemented " - "yet.")); -} - -template -typename std::enable_if::value, void>::type -DispatchMatmulFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - DispatchMatmulWithFlattenInt8Kernel( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -typename std::enable_if::value, void>::type -DispatchMatmulFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - MatmulWithFlattenKernelImpl( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -template -void MatmulWithFlattenKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int x_num_col_dims, - int y_num_col_dims, - DenseTensor* out) { - DispatchMatmulFlattenKernel( - dev_ctx, x, y, x_num_col_dims, y_num_col_dims, out); -} - -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h b/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h index aaa7fbd8d2c..7ba97234cc1 100644 --- a/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/multi_dot_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { template diff --git a/backends/metax_gpu/kernels/impl/mv_kernel_impl.h b/backends/metax_gpu/kernels/impl/mv_kernel_impl.h index a87d431e250..4baee25a099 100644 --- a/backends/metax_gpu/kernels/impl/mv_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/mv_kernel_impl.h @@ -14,7 +14,7 @@ #pragma once -#include "kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/solve_grad_kernel_impl.h index 860bce2cba5..1dd276dde2f 100644 --- a/backends/metax_gpu/kernels/impl/solve_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/solve_grad_kernel_impl.h @@ -14,11 +14,11 @@ limitations under the License. 
*/ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/expand_as_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/matrix_solve.h" #include "paddle/phi/kernels/funcs/reduce_function.h" diff --git a/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h index 08138853099..ad656b7a6c8 100644 --- a/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/triangular_solve_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/for_range.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu index 51f8f6792e2..c31d82920b3 100644 --- a/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu @@ -14,10 +14,10 @@ #include -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { diff --git a/backends/metax_gpu/kernels/metax_kernel/block_attn.h b/backends/metax_gpu/kernels/metax_kernel/block_attn.h index 1e1eb2c0961..a5b88e34be1 100644 --- a/backends/metax_gpu/kernels/metax_kernel/block_attn.h +++ b/backends/metax_gpu/kernels/metax_kernel/block_attn.h @@ -14,11 +14,11 @@ #pragma once -#include "kernels/funcs/quant_dequant.h" #include "kernels/metax_kernel/mmha_util.cu.h" #include "paddle/common/flags.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" +#include "paddle/phi/kernels/funcs/quant_dequant.h" COMMON_DECLARE_bool(use_xqa_optim); COMMON_DECLARE_bool(blha_use_fp32_qk_sum); diff --git a/backends/metax_gpu/kernels/metax_kernel/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h index 52a7709424b..b9f3d8af1c9 100644 --- a/backends/metax_gpu/kernels/metax_kernel/elementwise.h +++ b/backends/metax_gpu/kernels/metax_kernel/elementwise.h @@ -14,9 +14,9 @@ limitations under the License. 
*/ #pragma once -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/eigen/common.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h index 7386811a236..18f1e30f191 100644 --- a/backends/metax_gpu/kernels/metax_kernel/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -17,9 +17,9 @@ #include #include -#include "kernels/funcs/blas/cublasLt.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/custom/custom_context.h" +#include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/gpu/forwards.h" #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_helper.h" @@ -28,8 +28,6 @@ #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" -cublasLtHandle_t GetBlasLtHandle(); - namespace phi { class DnnWorkspaceHandle { public: diff --git a/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu index 895484324a9..8cf069c0f4b 100644 --- a/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu @@ -12,10 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/mv_grad_kernel.h" namespace phi { diff --git a/backends/metax_gpu/kernels/metax_kernel/quant_dequant.h b/backends/metax_gpu/kernels/metax_kernel/quant_dequant.h index a37fc8c5c57..80d325530f5 100644 --- a/backends/metax_gpu/kernels/metax_kernel/quant_dequant.h +++ b/backends/metax_gpu/kernels/metax_kernel/quant_dequant.h @@ -16,12 +16,12 @@ limitations under the License. 
*/ #include -#include "kernels/funcs/blas/blas.h" #include "paddle/common/hostdevice.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/transform.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" namespace phi { diff --git a/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu index bee25a721fa..ba33e68aa5e 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu @@ -17,8 +17,8 @@ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -// #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "kernels/funcs/blas/blas.h" +// #include "paddle/phi/paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/rank_attention.cu.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu index b6a4d2d76e9..eeb9c938888 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu @@ -17,8 +17,8 @@ #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/kernel_registry.h" -// #include "paddle/phi/kernels/funcs/blas/blas.h" -#include "kernels/funcs/blas/blas.h" +// #include "paddle/phi/paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/rank_attention.cu.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu index de263c91c4d..3e9a5683ae4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu @@ -20,12 +20,12 @@ #include #include "glog/logging.h" -#include "kernels/funcs/blas/blas.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/determinant_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/impl/determinant_kernel_impl.h" #include "paddle/phi/kernels/slogdeterminant_kernel.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu index 5ff3211fe87..ed1ed259437 100644 --- a/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/funcs/blas/blas.h" #include "paddle/common/ddim.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/expand_kernel.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/triangular_solve_kernel.h" diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 7ba32b5b399..70553934dfb 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -31,6 +31,56 @@ index bff0f2bf70..9376b5781f 100644 #include "paddle/phi/core/os_info.h" #include "paddle/phi/core/platform/device/gpu/gpu_info.h" #include "paddle/phi/core/platform/profiler/utils.h" +diff --git a/paddle/phi/backends/dynload/cublas.h b/paddle/phi/backends/dynload/cublas.h +index 62beb53cfe..0b0ac09fc0 100644 +--- a/paddle/phi/backends/dynload/cublas.h ++++ b/paddle/phi/backends/dynload/cublas.h +@@ -49,7 +49,12 @@ extern void *cublas_dso_handle; + std::call_once(cublas_dso_flag, []() { \ + cublas_dso_handle = phi::dynload::GetCublasDsoHandle(); \ + }); \ +- static void *p_##__name = dlsym(cublas_dso_handle, #__name); \ ++ std::string replaced_name = #__name; \ ++ replaced_name = replaced_name.replace(0, 2, "mc"); \ ++ int index = replaced_name.find("_", 0); \ ++ if (index != -1) replaced_name = replaced_name.substr(0, index); \ ++ static void* p_##__name = \ ++ dlsym(cublas_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ +diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h +index 0527e743e7..247a844f18 100644 +--- a/paddle/phi/backends/dynload/cublasLt.h ++++ b/paddle/phi/backends/dynload/cublasLt.h +@@ -46,12 +46,14 @@ extern void *cublasLt_dso_handle; + std::call_once(cublasLt_dso_flag, []() { \ + cublasLt_dso_handle = phi::dynload::GetCublasLtDsoHandle(); \ + }); \ +- static void *p_##__name = dlsym(cublasLt_dso_handle, #__name); \ ++ std::string replaced_name = #__name; \ ++ replaced_name = replaced_name.replace(0, 2, "mc"); \ ++ static void* p_##__name = \ ++ dlsym(cublasLt_dso_handle, replaced_name.c_str()); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name +- + // APIs available after CUDA 11.1 + #if CUDA_VERSION >= 11010 + #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ +@@ -79,8 +81,8 @@ extern void *cublasLt_dso_handle; + __macro(cublasLtMatmulAlgoConfigGetAttribute); \ + __macro(cublasLtMatmulAlgoGetIds); \ + __macro(cublasLtMatmulAlgoCapGetAttribute); \ +- __macro(cublasLtMatmulAlgoCheck); \ +- __macro(cublasLtGetCudartVersion); ++ __macro(cublasLtMatmulAlgoCheck); ++ // __macro(cublasLtGetCudartVersion); + #else + #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ diff --git a/paddle/phi/backends/dynload/cudnn.h b/paddle/phi/backends/dynload/cudnn.h index c0080f0a5e..458ca3e2e8 100644 --- a/paddle/phi/backends/dynload/cudnn.h @@ -210,6 +260,29 @@ index 8ec3cf2792..6f5460df00 100644 return reinterpret_cast(p_##__name)(args...); \ } \ }; \ +diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc +index 859f696896..87b5100a1b 100644 +--- a/paddle/phi/backends/dynload/dynamic_loader.cc ++++ b/paddle/phi/backends/dynload/dynamic_loader.cc +@@ -18,7 +18,6 @@ limitations under the License. 
*/ + #include + #include + #include +-#include "paddle/phi/backends/dynload/cupti_lib_path.h" + #include "paddle/phi/common/port.h" + #include "paddle/phi/core/enforce.h" + +@@ -108,6 +107,10 @@ COMMON_DECLARE_string(win_cuda_bin_dir); + #define SPARSELT_LIB_NAME "libcusparseLt.so" + #endif + ++#ifndef CUPTI_LIB_PATH ++#define CUPTI_LIB_PATH "@CUPTI_LIBRARY_PATH@" ++#endif ++ + #ifdef PADDLE_WITH_HIP + + PHI_DEFINE_string(miopen_dir, diff --git a/paddle/phi/backends/dynload/nvjpeg.h b/paddle/phi/backends/dynload/nvjpeg.h index c5309e7e11..3328571380 100644 --- a/paddle/phi/backends/dynload/nvjpeg.h @@ -346,21 +419,10 @@ index 4ff2e528a9..23f7f4b583 100644 for (int offset = warpSize / 2; offset > 0; offset /= 2) diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h -index 024a7de73e..1e4cdf16be 100644 +index 024a7de73e..66b373d698 100644 --- a/paddle/phi/core/enforce.h +++ b/paddle/phi/core/enforce.h -@@ -45,7 +45,9 @@ limitations under the License. */ - #endif - - #ifdef PADDLE_WITH_CUDA --#include "paddle/phi/backends/dynload/cublas.h" -+// #include "paddle/phi/backends/dynload/../../../../../cublas.h" -+#include "../backends/metax_gpu/kernels/funcs/blas/cublas.h" -+// #include "paddle/phi/backends/dynload/cublas.h" - #include "paddle/phi/backends/dynload/cudnn.h" - #include "paddle/phi/backends/dynload/curand.h" - #include "paddle/phi/backends/dynload/cusolver.h" -@@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; } +@@ -97,7 +97,7 @@ inline bool is_error(bool stat) { return !stat; } void ThrowWarnInternal(const std::string& message); @@ -369,75 +431,68 @@ index 024a7de73e..1e4cdf16be 100644 // For cuda, the assertions can affect performance and it is therefore // recommended to disable them in production code // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion -@@ -109,7 +111,7 @@ void ThrowWarnInternal(const std::string& message); +@@ -109,7 +109,7 @@ void ThrowWarnInternal(const std::string& message); __LINE__, \ #_IS_NOT_ERROR, \ ##__VA_ARGS__); \ - asm("trap;"); \ -+ __builtin_trap(); \ ++ __builtin_trap(); \ } \ } while (0) #elif defined(__HIPCC__) -@@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) { - - } // namespace enforce - using namespace enforce; // NOLINT --} // namespace phi -+} // namespace phi -\ No newline at end of file -diff --git a/paddle/phi/core/platform/device/gpu/gpu_types.h b/paddle/phi/core/platform/device/gpu/gpu_types.h -index c646e487d0..325122175c 100644 ---- a/paddle/phi/core/platform/device/gpu/gpu_types.h -+++ b/paddle/phi/core/platform/device/gpu/gpu_types.h -@@ -25,8 +25,9 @@ - #else - #include - --#include "paddle/phi/backends/dynload/cublas.h" --#include "paddle/phi/backends/dynload/cublasLt.h" -+// #include "paddle/phi/backends/dynload/cublas.h" -+#include "kernels/funcs/blas/cublas.h" -+// #include "paddle/phi/backends/dynload/cublasLt.h" - #include "paddle/phi/backends/dynload/cudnn.h" - #endif - -@@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode, - - // TODO(Ming Huang): Since there is no blasLt handler, - // use rocblas_handle for workaround. 
--DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); -+// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - - #undef DECLARE_TYPE_FOR_GPU - -diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h -index 2d02eb370b..8a7233e34e 100644 ---- a/paddle/phi/core/platform/device_context.h -+++ b/paddle/phi/core/platform/device_context.h -@@ -25,8 +25,8 @@ limitations under the License. */ - #include "paddle/phi/core/platform/device/gpu/gpu_types.h" - #include "paddle/phi/core/platform/device_type.h" - #ifdef PADDLE_WITH_CUDA --#include "paddle/phi/backends/dynload/cublas.h" --#include "paddle/phi/backends/dynload/cublasLt.h" -+#include "kernels/funcs/blas/cublas.h" -+#include "kernels/funcs/blas/cublasLt.h" - #include "paddle/phi/backends/dynload/cudnn.h" - #include "paddle/phi/backends/dynload/cusolver.h" - #include "paddle/phi/backends/dynload/cusparse.h" -diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h -index d69eb67d6f..1d8b6e9375 100644 ---- a/paddle/phi/kernels/cpu/index_select_impl.h -+++ b/paddle/phi/kernels/cpu/index_select_impl.h -@@ -18,7 +18,7 @@ +diff --git a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h +index e63b3d2f6e..95d7e6f204 100644 +--- a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h ++++ b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h +@@ -628,7 +628,13 @@ class CublasLtAlgoCache { + infile >> cublaslt_version; + VLOG(1) << "cublaslt_version " << cublaslt_version; + +- if (dynload::cublasLtGetCudartVersion() != cublaslt_version) { ++ // if (dynload::cublasLtGetCudartVersion() != cublaslt_version) { ++ // LOG(INFO) << algo_caches_file_ ++ // << " is not compatible with current cublaslt_version " ++ // << real_cublaslt_version; ++ // return; ++ // } ++ if (3000 != cublaslt_version) { + LOG(INFO) << algo_caches_file_ + << " is not compatible with current cublaslt_version " + << real_cublaslt_version; +@@ -655,7 +661,8 @@ class CublasLtAlgoCache { + if (dev == 0) { + std::ofstream outfile; + outfile.open(algo_caches_file_, std::ios::out | std::ios::trunc); +- outfile << dynload::cublasLtGetCudartVersion() << std::endl; ++ // outfile << dynload::cublasLtGetCudartVersion() << std::endl; ++ outfile << 3000 << std::endl; + + for (const auto& [seed, algo] : algo_caches_) { + outfile << seed << " "; +diff --git a/paddle/phi/kernels/funcs/cublaslt.h b/paddle/phi/kernels/funcs/cublaslt.h +index e7e1dd2370..583c7d6474 100644 +--- a/paddle/phi/kernels/funcs/cublaslt.h ++++ b/paddle/phi/kernels/funcs/cublaslt.h +@@ -42,19 +42,11 @@ class CublasLtHelper { + CublasLtHelper(int m, int k, int n, cublasLtHandle_t handle) + : handle_(handle), alpha_(1), beta_(0), m_(m), k_(k), n_(n) { + cublasStatus_t status; +-#if CUBLAS_VER_MAJOR < 11 +- cudaDataType_t cudaComputeType = CUDA_R_32I; +-#else + cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I; +-#endif - #include "paddle/phi/core/dense_tensor.h" - #include "paddle/phi/core/tensor_utils.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/math_function.h" + // matmul desc +-#if CUBLAS_VER_MAJOR < 11 +- status = dyl::cublasLtMatmulDescCreate(&matmul_desc_, cudaComputeType); +-#else + status = dyl::cublasLtMatmulDescCreate( + &matmul_desc_, cudaComputeType, CUDA_R_32I); +-#endif + PADDLE_ENFORCE_EQ( + status, 
diff --git a/paddle/phi/kernels/funcs/embedding_grad.h b/paddle/phi/kernels/funcs/embedding_grad.h index 461e6e2474..48a64ae9ce 100644 --- a/paddle/phi/kernels/funcs/embedding_grad.h @@ -453,38 +508,6 @@ index 461e6e2474..48a64ae9ce 100644 #endif dim3 threads(kWarpSize, kBlockDimY); dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); -diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu -index cb35feee32..64f5bd24ac 100644 ---- a/paddle/phi/kernels/funcs/fc_functor.cu -+++ b/paddle/phi/kernels/funcs/fc_functor.cu -@@ -16,12 +16,12 @@ limitations under the License. */ - - #include "paddle/phi/backends/all_context.h" - #include "paddle/phi/kernels/funcs/aligned_vector.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/fc_functor.h" - - #include "paddle/phi/backends/gpu/gpu_launch_config.h" - #include "paddle/phi/core/dense_tensor.h" --#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" -+// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" - #include "paddle/phi/kernels/funcs/quant_dequant.h" - #include "paddle/phi/kernels/matmul_kernel.h" - -diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu -index 88663ec880..98b93072a3 100644 ---- a/paddle/phi/kernels/funcs/gru_compute.cu -+++ b/paddle/phi/kernels/funcs/gru_compute.cu -@@ -12,7 +12,7 @@ limitations under the License. */ - #include "paddle/phi/kernels/funcs/gru_compute.h" - - #include "paddle/phi/backends/gpu/gpu_context.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" - #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h index 4eae698648..5c047723ea 100644 --- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h @@ -503,19 +526,6 @@ index 4eae698648..5c047723ea 100644 #endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; } -diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h -index 15e1a4a3c3..e4780538d7 100644 ---- a/paddle/phi/kernels/funcs/math/context_project.h -+++ b/paddle/phi/kernels/funcs/math/context_project.h -@@ -18,7 +18,7 @@ - #include - - #include "paddle/phi/core/tensor_utils.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/im2col.h" - - namespace phi { diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h index e5361b836e..5ad238df08 100644 --- a/paddle/phi/kernels/funcs/math_cuda_utils.h @@ -559,51 +569,20 @@ index e5361b836e..5ad238df08 100644 return val; } -diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu -index e101224970..a52eb6096f 100644 ---- a/paddle/phi/kernels/funcs/matrix_inverse.cu -+++ b/paddle/phi/kernels/funcs/matrix_inverse.cu -@@ -15,11 +15,13 @@ limitations under the License. 
*/ - #include "paddle/phi/kernels/funcs/matrix_inverse.h" - - #include "paddle/phi/common/memory_utils.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - +diff --git a/paddle/phi/kernels/funcs/quant_dequant.h b/paddle/phi/kernels/funcs/quant_dequant.h +index 8f0736f64e..f11c29a6ef 100644 +--- a/paddle/phi/kernels/funcs/quant_dequant.h ++++ b/paddle/phi/kernels/funcs/quant_dequant.h +@@ -19,9 +19,7 @@ limitations under the License. */ + #include "paddle/phi/backends/gpu/gpu_launch_config.h" + #include "paddle/phi/common/transform.h" + #include "paddle/phi/kernels/funcs/aligned_vector.h" +-#ifndef PADDLE_WITH_CUSTOM_DEVICE + #include "paddle/phi/kernels/funcs/blas/blas.h" +-#endif namespace phi { - namespace funcs { - -+ -+ - template - void MatrixInverseFunctor::operator()(const Context& dev_ctx, - const DenseTensor& a, -diff --git a/paddle/phi/kernels/funcs/matrix_solve.cu b/paddle/phi/kernels/funcs/matrix_solve.cu -index 558d363b39..05da04b517 100644 ---- a/paddle/phi/kernels/funcs/matrix_solve.cu -+++ b/paddle/phi/kernels/funcs/matrix_solve.cu -@@ -16,7 +16,7 @@ limitations under the License. */ - #include "paddle/phi/backends/gpu/cuda/cudnn_workspace_helper.h" - #include "paddle/phi/common/memory_utils.h" - #include "paddle/phi/core/tensor_utils.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/math_function.h" - #include "paddle/phi/kernels/funcs/scatter.cu.h" - -diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 047f52bd91..a05b34d3ba 100644 ---- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -+++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -@@ -27,7 +27,7 @@ namespace cub = hipcub; - - #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - namespace phi { + using backends::gpu::GpuLaunchConfig; diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -873,31 +852,17 @@ index e30d440ff3..108edda7ca 100644 } // namespace funcs } // namespace phi +// -diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -index 32db61532f..0220316bc3 100644 ---- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h -@@ -15,7 +15,7 @@ - #pragma once - - #if defined(PADDLE_WITH_CUDA) --#include "paddle/phi/backends/dynload/cublasLt.h" -+// #include "paddle/phi/backends/dynload/cublasLt.h" - #endif - - #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h -index 9d4bb18d55..ea42cc10a9 100644 +index 9d4bb18d55..80405c2b78 100644 --- a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h +++ b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h -@@ -638,9 +638,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( +@@ -638,9 +638,6 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( RandVec(&state, rand); #pragma unroll for (int jt = 0; jt < VecSize; jt++) { 
-#ifndef PADDLE_WITH_HIP -#pragma unroll -#endif -+// #pragma unroll mask_vec[it][jt] = static_cast(rand[jt] >= dropout_prob); } } @@ -942,19 +907,6 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. -diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu -index af27ac89ab..ee0edc6b8e 100644 ---- a/paddle/phi/kernels/gpu/dot_kernel.cu -+++ b/paddle/phi/kernels/gpu/dot_kernel.cu -@@ -15,7 +15,7 @@ - #include "paddle/phi/kernels/dot_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" - #include "paddle/phi/core/kernel_registry.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - - #include "paddle/phi/kernels/full_kernel.h" diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -1019,84 +971,6 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" -diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -index 9bc5326c90..79b57a8203 100644 ---- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h -@@ -21,7 +21,7 @@ limitations under the License. */ - #include "paddle/phi/common/amp_type_traits.h" - #include "paddle/phi/kernels/addmm_grad_kernel.h" - #include "paddle/phi/kernels/full_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - #include "paddle/phi/kernels/funcs/for_range.h" -diff --git a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -index cf80666b4e..ca76e055fb 100644 ---- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h -@@ -19,7 +19,7 @@ limitations under the License. */ - - #include "paddle/phi/common/amp_type_traits.h" - #include "paddle/phi/kernels/baddbmm_grad_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - #include "paddle/phi/kernels/funcs/for_range.h" -diff --git a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h -index 2789cb59a2..b91b076f7f 100644 ---- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h -+++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h -@@ -20,7 +20,7 @@ limitations under the License. 
*/ - - #include "paddle/phi/common/amp_type_traits.h" - #include "paddle/phi/kernels/baddbmm_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - -diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h -index 9a21c23666..86413d1577 100644 ---- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h -@@ -19,7 +19,7 @@ - #include "paddle/phi/kernels/conv_transpose_grad_kernel.h" - #include "paddle/phi/kernels/cpu/conv_util.h" - #include "paddle/phi/kernels/full_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" - #include "paddle/phi/kernels/funcs/im2col.h" - #include "paddle/phi/kernels/funcs/slice.h" -diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h -index 4459a931da..837c8682b8 100644 ---- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h -@@ -18,7 +18,7 @@ - #include "paddle/phi/core/dense_tensor.h" - #include "paddle/phi/kernels/empty_kernel.h" - #include "paddle/phi/kernels/full_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - - namespace phi { -diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h -index ad9e9197dd..5478d9817d 100644 ---- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h -+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h -@@ -18,7 +18,7 @@ - #include "paddle/phi/core/dense_tensor.h" - #include "paddle/phi/kernels/empty_kernel.h" - #include "paddle/phi/kernels/full_kernel.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - #include "paddle/phi/kernels/transpose_kernel.h" - #include "paddle/utils/optional.h" diff --git a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -1112,80 +986,3 @@ index e6b3960f6d..564125f1f6 100644 if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); -diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h -index 410fb3c560..009ce03440 100644 ---- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h -@@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - - template - HOSTDEVICE T digamma(T x) { -- static T pi = T{3.14159265358979323846}; -+ const static T pi = T{3.14159265358979323846}; - - if (x == T{0.0}) { - T inf = std::numeric_limits::infinity(); -diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -index 5ebbc8d2db..c7b6c338e2 100644 ---- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -+++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -@@ -15,8 +15,9 @@ limitations under the License. 
*/ - #include - #include - #include "paddle/phi/common/datatype_traits.h" --#include "paddle/phi/kernels/funcs/cublaslt.h" --#include "paddle/phi/kernels/funcs/quant_dequant.h" -+#include "kernels/funcs/blas/cublaslt.h" -+#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_kernel/metax_context.h" - - #pragma once - -@@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - - { - auto helper = -- std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); -+ std::make_unique(m, k, n, GetBlasLtHandle()); - helper->GEMM(quant_input.data(), - weight->data(), - int_out.data(), -diff --git a/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h -index 1f319c4ae3..9186eb6906 100644 ---- a/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h -+++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h -@@ -15,7 +15,7 @@ limitations under the License. */ - #pragma once - - #include "paddle/phi/core/dense_tensor.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/matrix_inverse.h" - - namespace phi { -diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h -index 6f03f76eeb..5fe2c3e7dc 100644 ---- a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h -+++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h -@@ -15,7 +15,7 @@ limitations under the License. */ - #pragma once - - #include "paddle/phi/core/dense_tensor.h" --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/for_range.h" - #include "paddle/phi/kernels/funcs/matrix_inverse.h" - -diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h -index 4099d8b506..baef2cd643 100644 ---- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h -+++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h -@@ -14,7 +14,7 @@ - - #pragma once - --#include "paddle/phi/kernels/funcs/blas/blas.h" -+#include "kernels/funcs/blas/blas.h" - #include "paddle/phi/kernels/funcs/eigen/common.h" - #include "paddle/phi/kernels/funcs/math_function.h" - diff --git a/backends/metax_gpu/runtime/runtime.cc b/backends/metax_gpu/runtime/runtime.cc index 36fbd88c2ea..edbe937e7ba 100644 --- a/backends/metax_gpu/runtime/runtime.cc +++ b/backends/metax_gpu/runtime/runtime.cc @@ -36,12 +36,12 @@ #include #include "glog/logging.h" -#include "kernels/funcs/blas/cublasLt.h" #include "paddle/fluid/platform/profiler/cuda_tracer.h" #include "paddle/fluid/platform/profiler/cupti_data_process.h" #include "paddle/phi/api/profiler/trace_event_collector.h" #include "paddle/phi/backends/device_base.h" #include "paddle/phi/backends/device_ext.h" +#include "paddle/phi/backends/dynload/cublasLt.h" #include "paddle/phi/backends/dynload/cupti.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" From f3b6cc45ed5726520e25fc3d65a75ad34168ac40 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Mon, 20 Oct 2025 17:06:44 +0800 Subject: [PATCH 81/95] fix activation_grad kernel (#118) * [Metax_change_ut] * fix sum&collect_fpn_proposals op register * modify profile * [Metax] fix paddle bug replace 'MoeGradDispatchKernel' to 'MoeGateDispatchKernel' * [Metax] register bce_loss_grad & bce_loss & index_add_grad kernels * [Metax] con2d_grad use gpudnn * blas handle support * [Metax] register some 
kernels & update CMakeLists * [Metax] fix metax unittest fail * [Metax] add group_norm & label_smooth kernel and update matmul kernel * [Metax] fix rmsprop kernel register and add meshgrid & meshgrid_grad kernel register * add test * add test * [test] chang the logic of workspace_host in cholesky_kernel_register alloc(cpuplace,size), test pass alloc(cpuplace, size, stream), crash * [Metax] fix compile fail * Revert "[Metax] fix compile fail" This reverts commit 83bc87f686227962b0262e044225c6ed5507b824. * [Metax] fix compile fail by 'conv_transpose_grad_kernel_impl.h' * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] con2d_grad use gpudnn * [Metax]fix bug and add qr lstsq logsoftmax * [Metax] change_patch * [Metax] update unit test CMakeLists.txt * [Metax] update unit test CMakeLists.txt * [feature] add unique_consecutive kernel * [metax] add some kernel * [metax] add some kernel * [Metax] register baddbmm kernel & update blas api * [Metax] register baddbmm kernel & update blas api * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [feature] add add unique_consecutive kernel.cu * [fix] fix some test case due to missing op register * [fix] fix some fail text * [metax]fix lu eigvalshsqueeze rnn kernel * [metax]fix lu eigvalshsqueeze rnn kernel * add and fix some kernels * [Metax] register deformable_conv kernel & fix 'ModulatedDeformableCol2imCoord' symbol undefined * [Metax] fix conflict * [Metax] adapt to paddle-cpu-20250901 & resolve the issue of 'test_elementwise_mul_op_metax' failure * [Metax] update repeat_interleave kernel & ignore max op test * [metax]fix lu eigvalshsqueeze rnn kernel * [metax] chang patch fix copy * [metax] chang patch fix copy * [Metax] update metax_gpu unit test * [Metax] fix test CMakeList.txt * fix some tests * add one test * fix one kernel --------- Co-authored-by: sw <1640472053@qq.com> Co-authored-by: duqimeng <77875733+duqimeng@users.noreply.github.com> Co-authored-by: Mingkun.Zhang <2496808993@qq.com> Co-authored-by: metax666 Co-authored-by: jiaxinWang-metax <189149612@qq.com> Co-authored-by: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Co-authored-by: chezhang <1376507468@qq.com> Co-authored-by: zhang-chenyi <74278535+zhang-chenyi@users.noreply.github.com> --- .../activation_grad_kernel_register.cu | 166 ++++++++++-------- 1 file changed, 91 insertions(+), 75 deletions(-) diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu index 6c46ef10c0f..d49e74dea73 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu @@ -15,8 +15,6 @@ limitations under the License. 
*/ #include "glog/logging.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/activation_grad_kernel.h" #include "paddle/phi/kernels/full_kernel.h" @@ -119,6 +117,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } + #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX( \ name, functor_class, attr1, attr2) \ template \ @@ -135,6 +134,7 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } + #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -161,6 +161,21 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, nullptr, &out, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& out, \ + const DenseTensor& dout, \ + double attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, nullptr, &out, &dout, dx, functor); \ + } + #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT( \ name, functor_class, attr1, attr2) \ template \ @@ -240,9 +255,9 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Celu, CudaCELUGradFunctor, alpha); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT(LogitCUDA, - CudaLogitGradFunctor, - eps); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPOUT(LogitCUDA, + CudaLogitGradFunctor, + eps); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(HardTanh, CudaHardTanhGradFunctor, @@ -266,6 +281,7 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(ThresholdedRelu, CudaThresholdedReluGradFunctor, threshold, value); + template void SiluGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -390,14 +406,14 @@ PD_CUSTOM_KERNEL_REGISTER(relu_grad, phi::ReluGradKernel, float, double, - phi::dtype::float16) {} + phi::float16) {} PD_CUSTOM_KERNEL_REGISTER(relu_double_grad, metax_gpu, ALL_LAYOUT, phi::ReluDoubleGradKernel, float, double, - phi::dtype::float16) {} + phi::float16) {} #else PD_CUSTOM_KERNEL_REGISTER(relu_grad, metax_gpu, @@ -405,16 +421,16 @@ PD_CUSTOM_KERNEL_REGISTER(relu_grad, phi::ReluGradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(relu_double_grad, metax_gpu, ALL_LAYOUT, phi::ReluDoubleGradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} #endif #define PD_REGISTER_ACTIVATION_GRAD_KERNEL(name, func) \ @@ -424,8 +440,8 @@ PD_CUSTOM_KERNEL_REGISTER(relu_double_grad, phi::func, \ float, \ double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16) {} + phi::float16, \ + phi::bfloat16) {} #define PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(name, func) \ PD_CUSTOM_KERNEL_REGISTER(name, \ @@ -434,10 +450,10 @@ PD_CUSTOM_KERNEL_REGISTER(relu_double_grad, phi::func, \ float, \ double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16, \ - phi::dtype::complex, \ - phi::dtype::complex) {} + phi::float16, \ + phi::bfloat16, \ + phi::complex64, \ + 
phi::complex128) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(sin_grad, SinGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(cos_grad, CosGradKernel) @@ -483,10 +499,10 @@ PD_CUSTOM_KERNEL_REGISTER(exp_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL(softshrink_grad, SoftShrinkGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_shrink_grad, HardShrinkGradKernel) @@ -502,10 +518,10 @@ PD_CUSTOM_KERNEL_REGISTER(expm1_grad, phi::Expm1GradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(square_grad, metax_gpu, @@ -515,10 +531,10 @@ PD_CUSTOM_KERNEL_REGISTER(square_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(square_double_grad, metax_gpu, ALL_LAYOUT, @@ -527,10 +543,10 @@ PD_CUSTOM_KERNEL_REGISTER(square_double_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(sin_double_grad, metax_gpu, @@ -540,10 +556,10 @@ PD_CUSTOM_KERNEL_REGISTER(sin_double_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(sin_triple_grad, metax_gpu, @@ -553,10 +569,10 @@ PD_CUSTOM_KERNEL_REGISTER(sin_triple_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(cos_double_grad, metax_gpu, @@ -566,10 +582,10 @@ PD_CUSTOM_KERNEL_REGISTER(cos_double_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(cos_triple_grad, metax_gpu, @@ -579,10 +595,10 @@ PD_CUSTOM_KERNEL_REGISTER(cos_triple_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(softsign_grad, SoftsignGradKernel) @@ -604,10 +620,10 @@ PD_CUSTOM_KERNEL_REGISTER(log_double_grad, phi::LogDoubleGradKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_REGISTER_ACTIVATION_GRAD_KERNEL_WITH_COMPLEX(hardswish_grad, HardSwishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel) @@ -622,8 +638,8 @@ PD_CUSTOM_KERNEL_REGISTER(rint_grad, int64_t, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(round_grad, metax_gpu, ALL_LAYOUT, @@ -632,10 +648,10 @@ PD_CUSTOM_KERNEL_REGISTER(round_grad, int64_t, float, double, - phi::dtype::float16, - 
phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(pow_grad, metax_gpu, ALL_LAYOUT, @@ -644,10 +660,10 @@ PD_CUSTOM_KERNEL_REGISTER(pow_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(pow_double_grad, metax_gpu, ALL_LAYOUT, @@ -656,10 +672,10 @@ PD_CUSTOM_KERNEL_REGISTER(pow_double_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(pow_triple_grad, metax_gpu, ALL_LAYOUT, @@ -668,10 +684,10 @@ PD_CUSTOM_KERNEL_REGISTER(pow_triple_grad, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(ceil_grad, metax_gpu, ALL_LAYOUT, @@ -683,8 +699,8 @@ PD_CUSTOM_KERNEL_REGISTER(ceil_grad, int16_t, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(floor_grad, metax_gpu, ALL_LAYOUT, @@ -696,5 +712,5 @@ PD_CUSTOM_KERNEL_REGISTER(floor_grad, int16_t, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} From c2bb7099311feb00cfc03050bf02565e89461aa9 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 21 Oct 2025 15:07:06 +0800 Subject: [PATCH 82/95] updata flag_and_fix_activation * updata flag_and_fix_activation * updataignore --------- --- backends/metax_gpu/common/flags_declare.cc | 21 +++ .../activation_grad_kernel_register.cu | 21 ++- .../activation_kernel_register.cu | 133 ++++++++++-------- .../kernels/metax_kernel/mmha_util.cu.h | 10 +- backends/metax_gpu/tests/ignore.txt | 6 +- 5 files changed, 119 insertions(+), 72 deletions(-) diff --git a/backends/metax_gpu/common/flags_declare.cc b/backends/metax_gpu/common/flags_declare.cc index 6b497cf9fdf..fb656878033 100644 --- a/backends/metax_gpu/common/flags_declare.cc +++ b/backends/metax_gpu/common/flags_declare.cc @@ -37,6 +37,27 @@ */ static constexpr int kDefaultConvWorkspaceSizeLimitMB = 512; +/** + * CUDA related FLAG + * Name: FLAGS_cublaslt_exhaustive_search_times + * Since Version: 2.3.0 + * Value Range: int64_t, default=0 + * Example: + * Note: Represents times of exhaustive search to evaluate performance of + * cuBlasLt matmul algorithm (with/without epilogue). Set this flag + * with value > 0 to enable exhaustive search. Default is 0, means + * getting algorithms via heuristic search. There are two search methods + * in cuBlasLt, heuristic search and exhaustive search. Exhaustive search + * attempts all cuBlasLt algorithms to select the fastest, which is very + * time-consuming, and the selected algorithm will be cached for a given + * layer specification Once you change the layer specifications + * (such as M, N and K), it will re-search again. 
+ */ +PHI_DEFINE_EXPORTED_int64( + cublaslt_exhaustive_search_times, + 0, + "The times of exhaustive search for cuBlasLt matmul with/without " + " epilogue algorithms, default is 0, means disabling exhaustive search."); PHI_DEFINE_EXPORTED_bool( cudnn_exhaustive_search, diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu index d49e74dea73..f5ee4ec25f8 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu @@ -101,6 +101,21 @@ void ActivationGradGPUImpl(const Context& dev_ctx, dev_ctx, &x, nullptr, &dout, dx, functor); \ } +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX( \ + name, functor_class, attr) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + double attr, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } + #define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX( \ name, functor_class, attr1, attr2) \ template \ @@ -239,9 +254,9 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Swish, CudaSwishGradFunctor); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, - CudaLeakyReluGradFunctor, - alpha); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_DOUBLE_ATTRS_DEPX(LeakyRelu, + CudaLeakyReluGradFunctor, + alpha); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(SoftShrink, CudaSoftShrinkGradFunctor, lambda); diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu index 363932cfc28..d91e4afd25e 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu @@ -14,8 +14,6 @@ limitations under the License. 
*/ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_device_function.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/float16.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/full_kernel.h" @@ -75,6 +73,19 @@ void ActivationGPUImpl(const Context& dev_ctx, dev_ctx, x, out, functor); \ } +#define DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(name, functor_class, attr) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + double attr, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } + #define DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS( \ name, functor_class, attr1, attr2) \ template \ @@ -90,6 +101,7 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl>( \ dev_ctx, x, out, functor); \ } + #define DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS( \ name, functor_class, attr1, attr2) \ template \ @@ -105,6 +117,7 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl>( \ dev_ctx, x, out, functor); \ } + DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) @@ -138,8 +151,10 @@ DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Log1p, CudaLog1pFunctor) DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Exp, CudaExpFunctor) DEFINE_GPU_ACTIVATION_KERNEL_WITH_INT_IN_FLOAT_OUT(Expm1, CudaExpm1Functor) -DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) -DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LogitCUDA, CudaLogitFunctor, eps) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(LeakyRelu, + CudaLeakyReluFunctor, + alpha) +DEFINE_GPU_ACT_KERNEL_WITH_ONE_DOUBLE_ATTRS(LogitCUDA, CudaLogitFunctor, eps) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(HardShrink, CudaHardShrinkFunctor, threshold) @@ -286,13 +301,9 @@ void PowKernel(const Context& dev_ctx, } // namespace phi #ifdef PADDLE_WITH_HIP -PD_CUSTOM_KERNEL_REGISTER(relu, - metax_gpu, - ALL_LAYOUT, - phi::ReluKernel, - float, - double, - phi::dtype::float16) {} +PD_CUSTOM_KERNEL_REGISTER( + relu, metax_gpu, ALL_LAYOUT, phi::ReluKernel, float, double, phi::float16) { +} #else PD_CUSTOM_KERNEL_REGISTER(relu, metax_gpu, @@ -300,8 +311,8 @@ PD_CUSTOM_KERNEL_REGISTER(relu, phi::ReluKernel, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} #endif #define PD_REGISTER_ACTIVATION_KERNEL(name, func) \ @@ -311,8 +322,8 @@ PD_CUSTOM_KERNEL_REGISTER(relu, phi::func, \ float, \ double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16) {} + phi::float16, \ + phi::bfloat16) {} #define PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(name, func) \ PD_CUSTOM_KERNEL_REGISTER(name, \ @@ -321,10 +332,10 @@ PD_CUSTOM_KERNEL_REGISTER(relu, phi::func, \ float, \ double, \ - phi::dtype::float16, \ - phi::dtype::bfloat16, \ - phi::dtype::complex, \ - phi::dtype::complex) {} + phi::float16, \ + phi::bfloat16, \ + phi::complex64, \ + phi::complex128) {} PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sin, SinKernel) PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(cos, CosKernel) @@ -357,10 +368,10 @@ PD_CUSTOM_KERNEL_REGISTER(exp, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} 
PD_CUSTOM_KERNEL_REGISTER(expm1, metax_gpu, ALL_LAYOUT, @@ -369,10 +380,10 @@ PD_CUSTOM_KERNEL_REGISTER(expm1, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(square, metax_gpu, ALL_LAYOUT, @@ -381,10 +392,10 @@ PD_CUSTOM_KERNEL_REGISTER(square, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_REGISTER_ACTIVATION_KERNEL(hard_shrink, HardShrinkKernel) PD_REGISTER_ACTIVATION_KERNEL(softshrink, SoftShrinkKernel) @@ -409,8 +420,8 @@ PD_CUSTOM_KERNEL_REGISTER(rint, int64_t, float, double, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(round, metax_gpu, ALL_LAYOUT, @@ -419,10 +430,10 @@ PD_CUSTOM_KERNEL_REGISTER(round, int64_t, float, double, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(log, metax_gpu, ALL_LAYOUT, @@ -431,10 +442,10 @@ PD_CUSTOM_KERNEL_REGISTER(log, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(log2, metax_gpu, ALL_LAYOUT, @@ -443,10 +454,10 @@ PD_CUSTOM_KERNEL_REGISTER(log2, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(log10, metax_gpu, ALL_LAYOUT, @@ -455,10 +466,10 @@ PD_CUSTOM_KERNEL_REGISTER(log10, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(log1p, metax_gpu, ALL_LAYOUT, @@ -467,10 +478,10 @@ PD_CUSTOM_KERNEL_REGISTER(log1p, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(pow, metax_gpu, ALL_LAYOUT, @@ -479,10 +490,10 @@ PD_CUSTOM_KERNEL_REGISTER(pow, double, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex, - phi::dtype::complex) {} + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} PD_CUSTOM_KERNEL_REGISTER(ceil, metax_gpu, ALL_LAYOUT, @@ -494,8 +505,8 @@ PD_CUSTOM_KERNEL_REGISTER(ceil, int16_t, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} PD_CUSTOM_KERNEL_REGISTER(floor, metax_gpu, ALL_LAYOUT, @@ -507,5 +518,5 @@ PD_CUSTOM_KERNEL_REGISTER(floor, int16_t, int, int64_t, - phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/mmha_util.cu.h b/backends/metax_gpu/kernels/metax_kernel/mmha_util.cu.h index aa352e600b5..187b0fc534a 100644 --- a/backends/metax_gpu/kernels/metax_kernel/mmha_util.cu.h +++ b/backends/metax_gpu/kernels/metax_kernel/mmha_util.cu.h @@ -49,10 +49,10 @@ #pragma once -#if defined(__CUDACC__) && CUDA_VERSION >= 11000 +// #if defined(__CUDACC__) && CUDA_VERSION >= 11000 
#define ENABLE_BF16 #include -#endif +// #endif #ifdef PADDLE_WITH_HIP #include @@ -72,8 +72,8 @@ namespace cub = hipcub; #endif #include "paddle/phi/common/datatype_traits.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" - #ifdef PADDLE_WITH_HIP /// integral_constant template @@ -130,7 +130,7 @@ struct Float4_ { float2 y; }; -#if defined(ENABLE_BF16) || defined(PADDLE_WITH_HIP) +// #if defined(ENABLE_BF16) || defined(PADDLE_WITH_HIP) struct bf16_4_t { __nv_bfloat162 x; __nv_bfloat162 y; @@ -142,7 +142,7 @@ struct bf16_8_t { __nv_bfloat162 z; __nv_bfloat162 w; }; -#endif +// #endif //----------------------------------- template diff --git a/backends/metax_gpu/tests/ignore.txt b/backends/metax_gpu/tests/ignore.txt index be0357e5319..2b0fae559e6 100644 --- a/backends/metax_gpu/tests/ignore.txt +++ b/backends/metax_gpu/tests/ignore.txt @@ -24,9 +24,9 @@ test_conv3d_layer test_conv3d_transpose_part2_op test_fused_conv2d_add_act_op test_swiglu_metax -test_set_value_op -test_pad_op test_squared_l2_norm_op -test_concat_op test_dygraph_spectral_norm test_bincount_op +test_adamw_op +test_einsum_op +test_complex_matmul From 8f161637ce03c6501e2aae5eba993b2ad1ef8778 Mon Sep 17 00:00:00 2001 From: duqimeng <77875733+duqimeng@users.noreply.github.com> Date: Tue, 21 Oct 2025 16:11:49 +0800 Subject: [PATCH 83/95] updata_patch (#120) * updata_patch --------- --- backends/metax_gpu/patch/paddle.patch | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 70553934dfb..4c844e5cc82 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -50,7 +50,7 @@ index 62beb53cfe..0b0ac09fc0 100644 } \ }; \ diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h -index 0527e743e7..247a844f18 100644 +index 8b2e08c777..ca926df151 100644 --- a/paddle/phi/backends/dynload/cublasLt.h +++ b/paddle/phi/backends/dynload/cublasLt.h @@ -46,12 +46,14 @@ extern void *cublasLt_dso_handle; @@ -68,7 +68,7 @@ index 0527e743e7..247a844f18 100644 extern DynLoad__##__name __name - // APIs available after CUDA 11.1 - #if CUDA_VERSION >= 11010 + #if CUDA_VERSION >= 11010 || defined(PADDLE_WITH_CUSTOM_DEVICE) #define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ @@ -79,8 +81,8 @@ extern void *cublasLt_dso_handle; __macro(cublasLtMatmulAlgoConfigGetAttribute); \ @@ -440,6 +440,7 @@ index 024a7de73e..66b373d698 100644 } \ } while (0) #elif defined(__HIPCC__) + diff --git a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h index e63b3d2f6e..95d7e6f204 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h @@ -470,7 +471,7 @@ index e63b3d2f6e..95d7e6f204 100644 for (const auto& [seed, algo] : algo_caches_) { outfile << seed << " "; diff --git a/paddle/phi/kernels/funcs/cublaslt.h b/paddle/phi/kernels/funcs/cublaslt.h -index e7e1dd2370..583c7d6474 100644 +index fbbf57c25a..f690db59e9 100644 --- a/paddle/phi/kernels/funcs/cublaslt.h +++ b/paddle/phi/kernels/funcs/cublaslt.h @@ -42,19 +42,11 @@ class CublasLtHelper { @@ -569,20 +570,6 @@ index e5361b836e..5ad238df08 100644 return val; } -diff --git a/paddle/phi/kernels/funcs/quant_dequant.h b/paddle/phi/kernels/funcs/quant_dequant.h -index 8f0736f64e..f11c29a6ef 100644 ---- a/paddle/phi/kernels/funcs/quant_dequant.h -+++ b/paddle/phi/kernels/funcs/quant_dequant.h -@@ -19,9 +19,7 @@ 
limitations under the License. */ - #include "paddle/phi/backends/gpu/gpu_launch_config.h" - #include "paddle/phi/common/transform.h" - #include "paddle/phi/kernels/funcs/aligned_vector.h" --#ifndef PADDLE_WITH_CUSTOM_DEVICE - #include "paddle/phi/kernels/funcs/blas/blas.h" --#endif - namespace phi { - - using backends::gpu::GpuLaunchConfig; diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -893,7 +880,7 @@ index b2d15a59f8..f64582e85a 100644 namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h -index f0cca0f701..02ea957240 100644 +index 2edac5eba5..4f265e3db7 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -29,8 +29,8 @@ namespace cub = hipcub; @@ -959,7 +946,7 @@ index 63c35dd4ee..15da9aea45 100644 namespace phi { diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu -index 1bdbe1564c..f753b54bc6 100644 +index c7f27b2924..4cf6204ac7 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu +++ b/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -21,7 +21,7 @@ From b272dbe557db51ffe0def0b38e5d697c721b3995 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 22 Oct 2025 09:53:37 +0800 Subject: [PATCH 84/95] Update Paddle submodule to latest develop (#121) Co-authored-by: tianshuo78520a --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 5dbecdcb0e4..1f00e2178ad 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 5dbecdcb0e4ddd3488927f49082dfb66c794f9e7 +Subproject commit 1f00e2178ad3249ecd8bb83e59bc6ac1ebcac413 From dc38f3d79c539796767a7454ca1fcd76486441db Mon Sep 17 00:00:00 2001 From: jxwangmetax <189149612@qq.com> Date: Wed, 22 Oct 2025 10:23:24 +0800 Subject: [PATCH 85/95] [metax] modify kernels (#122) * modify kernels --- backends/metax_gpu/patch/paddle.patch | 158 +++++++++++++++++++++++++- 1 file changed, 157 insertions(+), 1 deletion(-) diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 4c844e5cc82..6578029129e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -440,7 +440,163 @@ index 024a7de73e..66b373d698 100644 } \ } while (0) #elif defined(__HIPCC__) - +diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +index ae7b67de6d..fbe9f67737 100644 +--- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h ++++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +@@ -368,7 +368,7 @@ struct CUBlas { + cudaDataType_t Ctype, + int ldc, + int batchCount, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + #if CUDA_VERSION >= 9000 +@@ -476,7 +476,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + #if CUDA_VERSION >= 9000 +@@ -532,7 +532,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int64_t ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 12030 && defined(__linux__) + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool 
use_tensor_op_math = dev_ctx->tensor_core_available(); +@@ -759,7 +759,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + #if CUDA_VERSION >= 9000 +@@ -815,7 +815,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int64_t ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 12030 && defined(__linux__) + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = dev_ctx->tensor_core_available(); +@@ -1154,7 +1154,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + #if CUDA_VERSION >= 9000 +@@ -1210,7 +1210,7 @@ struct CUBlas { + void *C, + cudaDataType_t Ctype, + int64_t ldc, +- cudaDataType_t computeType) { ++ cublasComputeType_t computeType) { + #if CUDA_VERSION >= 12030 && defined(__linux__) + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = dev_ctx->tensor_core_available(); +@@ -1484,7 +1484,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16F, + N, +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + #else + PADDLE_THROW(common::errors::Unimplemented( + "GEMM_EX_64 is not supported on cuda < 12.3")); +@@ -1508,7 +1508,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16F, + static_cast(N), +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + } + #else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm +@@ -1694,7 +1694,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16F, + N, +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + #else + PADDLE_THROW(common::errors::Unimplemented( + "GEMM_EX_64 is not supported on cuda < 12.3")); +@@ -1719,7 +1719,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16F, + static_cast(N), +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + #else + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + dev_ctx_.CublasCall([&](cublasHandle_t handle) { +@@ -1831,7 +1831,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16BF, + static_cast(N), +- CUDA_R_32F, ++ CUBLAS_COMPUTE_32F, + algo)); + }); + } +@@ -1932,7 +1932,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_R_16BF, + static_cast(N), +- CUDA_R_32F, ++ CUBLAS_COMPUTE_32F, + algo)); + }); + } +@@ -2026,7 +2026,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_C_32F, + static_cast(N), +- CUDA_C_32F); ++ CUBLAS_COMPUTE_32F); + + #else + dev_ctx_.CublasCall([&](cublasHandle_t handle) { +@@ -2111,7 +2111,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_C_64F, + N, +- CUDA_C_64F); ++ CUBLAS_COMPUTE_64F); + #else + PADDLE_THROW(common::errors::Unimplemented( + "GEMM_EX_64 is not supported on cuda < 12.3")); +@@ -2136,7 +2136,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + C, + CUDA_C_64F, + static_cast(N), +- CUDA_C_64F); ++ CUBLAS_COMPUTE_64F); + #else // CUDA_VERSION >= 8000 + // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm + dev_ctx_.CublasCall([&](cublasHandle_t handle) { +@@ -3129,7 +3129,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, + CUDA_R_16F, + ldc, + batchCount, +- CUDA_R_32F); ++ CUBLAS_COMPUTE_32F); + } + + template <> diff --git a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h 
b/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h index e63b3d2f6e..95d7e6f204 100644 --- a/paddle/phi/kernels/funcs/blas/blaslt_gemm_search.h From 342ff813f2a5935a2503fb6d2eead929f8607508 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Thu, 23 Oct 2025 09:58:26 +0800 Subject: [PATCH 86/95] [Metax] fix weight_quant & weight_only_linear bug --- .../kernels/metax_kernel/weight_only_linear_kernel.cu | 4 ++-- .../kernels/metax_kernel/weight_quantize_kernel_register.cu | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu index d2f39ccf751..65cf99d3065 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu @@ -166,7 +166,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, mctlassGemmScaleOp_w4a16_nobias::epilogueParams( reinterpret_cast(bias_data)), mctlassGemmScaleOp_w4a16_nobias::quantscaleParams( - 1, + 2, group_size, reinterpret_cast(weight_scale_data)), reinterpret_cast(x_data), @@ -191,7 +191,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, mctlassGemmScaleOp_w4a16_bias::epilogueParams( reinterpret_cast(bias_data)), mctlassGemmScaleOp_w4a16_bias::quantscaleParams( - 1, + 2, group_size, reinterpret_cast(weight_scale_data)), reinterpret_cast(x_data), diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 44ac7f2fddc..46045f55c27 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -120,7 +120,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m, n}); + out->Resize({m, n / 2}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); From 5fe7108e40ac7179ad8cce5967f5f8fe9d15e7f0 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Thu, 23 Oct 2025 10:01:26 +0800 Subject: [PATCH 87/95] [Metax] fix weight_quant & weight_only_linear bug (#125) * [Metax] fix weight_quant & weight_only_linear bug --- .../kernels/metax_kernel/weight_only_linear_kernel.cu | 4 ++-- .../kernels/metax_kernel/weight_quantize_kernel_register.cu | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu index d2f39ccf751..65cf99d3065 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu @@ -166,7 +166,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, mctlassGemmScaleOp_w4a16_nobias::epilogueParams( reinterpret_cast(bias_data)), mctlassGemmScaleOp_w4a16_nobias::quantscaleParams( - 1, + 2, group_size, reinterpret_cast(weight_scale_data)), reinterpret_cast(x_data), @@ -191,7 +191,7 @@ void WeightOnlyLinearKernel(const Context& dev_ctx, mctlassGemmScaleOp_w4a16_bias::epilogueParams( reinterpret_cast(bias_data)), mctlassGemmScaleOp_w4a16_bias::quantscaleParams( - 1, + 2, group_size, reinterpret_cast(weight_scale_data)), reinterpret_cast(x_data), diff --git 
a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 44ac7f2fddc..46045f55c27 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -120,7 +120,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m, n}); + out->Resize({m, n / 2}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); From 14a340c28b778cb9926740fb7bd39879af31d449 Mon Sep 17 00:00:00 2001 From: ZhouDuan <136539532+1184319564@users.noreply.github.com> Date: Fri, 24 Oct 2025 10:27:19 +0800 Subject: [PATCH 88/95] fix and add some kernels (#126) * fix and add some kernels --- ...used_gemm_epilogue_grad_kernel_register.cu | 26 +++++++++++++++++++ .../fused_gemm_epilogue_kernel_register.cu | 26 +++++++++++++++++++ ...d_linear_param_grad_add_kernel_register.cu | 24 +++++++++++++++++ .../cuda_kernels/pad_grad_kernel_register.cu | 8 +++--- .../softmax_kernel_grad_register.cu | 1 + 5 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_grad_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_kernel_register.cu create mode 100644 backends/metax_gpu/kernels/cuda_kernels/fused_linear_param_grad_add_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_grad_kernel_register.cu new file mode 100644 index 00000000000..2e8d33b964c --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_grad_kernel_register.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h" +#include "paddle/phi/kernels/fusion/gpu/fused_gemm_epilogue_grad_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_gemm_epilogue_grad, + metax_gpu, + ALL_LAYOUT, + phi::fusion::FusedGemmEpilogueGradKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_kernel_register.cu new file mode 100644 index 00000000000..9be5794c54f --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_gemm_epilogue_kernel_register.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/fused_gemm_epilogue.h" +#include "paddle/phi/kernels/fusion/gpu/fused_gemm_epilogue_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_gemm_epilogue, + metax_gpu, + ALL_LAYOUT, + phi::fusion::FusedGemmEpilogueKernel, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_linear_param_grad_add_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/fused_linear_param_grad_add_kernel_register.cu new file mode 100644 index 00000000000..c88f94625b7 --- /dev/null +++ b/backends/metax_gpu/kernels/cuda_kernels/fused_linear_param_grad_add_kernel_register.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_linear_param_grad_add_kernel.cu" //NOLINT +PD_CUSTOM_KERNEL_REGISTER(fused_linear_param_grad_add, + metax_gpu, + ALL_LAYOUT, + phi::fusion::FusedLinearParamGradAdd, + float, + double, + phi::float16, + phi::bfloat16) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/pad_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/pad_grad_kernel_register.cu index 38b89fce698..f87f589a424 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/pad_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/pad_grad_kernel_register.cu @@ -20,6 +20,8 @@ PD_CUSTOM_KERNEL_REGISTER(pad_grad, ALL_LAYOUT, phi::PadGradKernel, float, - phi::dtype::float16, - phi::dtype::bfloat16, - phi::dtype::complex) {} + double, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu index 9b981029fc0..407180deca8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu @@ -45,5 +45,6 @@ PD_REGISTER_PLUGIN_KERNEL(softmax_grad, ALL_LAYOUT, phi::SoftmaxGradGPUDNNKernel, float, + double, phi::dtype::float16, phi::dtype::bfloat16) {} From f507479eaabe013c0605aee3528df550d38ad440 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Tue, 28 Oct 2025 09:52:16 +0800 Subject: [PATCH 89/95] [Metax] fix 'WeightQuantizeKernel' wint4 branch --- .../kernels/metax_kernel/weight_quantize_kernel_register.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 46045f55c27..cb80385a7a0 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -120,7 +120,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m, n / 2}); + out->Resize({m / 2, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); From faac2c969d9b609d3e5443c43ad55e958b6de5b3 Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Tue, 28 Oct 2025 09:55:12 +0800 Subject: [PATCH 90/95] [Metax] fix 'WeightQuantizeKernel' wint4 branch (#133) * [Metax] fix 'WeightQuantizeKernel' wint4 branch --- .../kernels/metax_kernel/weight_quantize_kernel_register.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 46045f55c27..cb80385a7a0 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -120,7 +120,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m, n / 2}); + out->Resize({m / 2, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); From b3c816b2a58ba97b5460dad0064cf90100c5aafd Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Wed, 29 Oct 2025 09:51:12 +0800 Subject: [PATCH 
91/95] [Metax] add quanted weight layout transformation using CPU programming --- .../impl/metax_weight_quantize_kernel_impl.h | 149 ++++++++++++++++++ .../weight_quantize_kernel_register.cu | 3 +- 2 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h diff --git a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h new file mode 100644 index 00000000000..e6ff489b3dc --- /dev/null +++ b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h @@ -0,0 +1,149 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { + +void cpu_2d_tensor_transpose(const DenseTensor& input_data, + DenseTensor* transposed_data) { + const int64_t input_data_rows = input_data.dims()[0]; + const int64_t input_data_cols = input_data.dims()[1]; + + const int8_t* input_data_ptr = input_data.data(); + int8_t* transposed_data_ptr = transposed_data->data(); + + for (int64_t r = 0; r < input_data_rows; r++) { + for (int64_t c = 0; c < input_data_cols; c++) { + *(transposed_data_ptr + r + c * input_data_rows) = + *(input_data_ptr + r * input_data_cols + c); + } + } +} + +void cpu_int4_quanted_weight_raw_unpack(const DenseTensor& packed_data, + DenseTensor* unpacked_data) { + const int64_t packed_data_rows = packed_data.dims()[0]; + const int64_t packed_data_cols = packed_data.dims()[1]; + + const int8_t* packed_data_ptr = packed_data.data(); + int8_t* unpacked_data_ptr = unpacked_data->data(); + + for (int64_t c = 0; c < packed_data_cols; c++) { + for (int64_t r = 0; r < packed_data_rows; r++) { + int8_t val = *(packed_data_ptr + r * packed_data_cols + c); + int8_t low_int4 = val & 0x0f; + int8_t hight_int4 = (val >> 4) & 0x0f; + + *(unpacked_data_ptr + (2 * r) * packed_data_cols + c) = + low_int4 >= 8 ? low_int4 - 16 : low_int4; + *(unpacked_data_ptr + (2 * r + 1) * packed_data_cols + c) = + hight_int4 >= 8 ? hight_int4 - 16 : hight_int4; + } + } +} + +void cpu_int4_quanted_weight_col_pack(const DenseTensor& unpacked_data, + DenseTensor* packed_data) { + const int64_t packed_data_rows = packed_data->dims()[0]; + const int64_t packed_data_cols = packed_data->dims()[1]; + + int8_t* packed_data_ptr = packed_data->data(); + const int8_t* unpacked_data_ptr = unpacked_data.data(); + + for (int64_t r = 0; r < packed_data_rows; r++) { + for (int64_t c = 0; c < packed_data_cols; c++) { + int8_t low_int4 = *(unpacked_data_ptr + 2 * r * packed_data_cols + 2 * c); + int8_t hight_int4 = + *(unpacked_data_ptr + 2 * r * packed_data_cols + 2 * c + 1); + + low_int4 = low_int4 < 0 ? low_int4 + 16 : low_int4; + hight_int4 = hight_int4 < 0 ? 
hight_int4 + 16 : hight_int4; + + *(packed_data_ptr + r * packed_data_cols + c) = + ((hight_int4 << 4) & 0xf0) | (low_int4 & 0x0f); + } + } +} + +void show_2d_cpu_tensor(const DenseTensor& tensor, const int64_t size = 3) { + const int64_t rows = tensor.dims()[0]; + const int64_t cols = tensor.dims()[1]; + printf("\nTensor shape = [%d, %d]\n", rows, cols); + + const int8_t* cpu_ptr = tensor.data(); + + for (int r = 0; r < size; r++) { + for (int c = 0; c < size; c++) { + int8_t val = *(cpu_ptr + r * cols + c); + printf("%d ", val); + } + printf("\n"); + } + printf("\n\n"); +} + +template +void MetaxQuantizedWeightLayoutTrans(const Context& dev_ctx, + const std::string& algo, + const std::vector& shape, + DenseTensor* out) { + const int64_t m = shape[0]; + const int64_t n = shape[1]; + + phi::CPUPlace cpu_place; + + if (algo == "weight_only_int4") { + out->Resize({m / 2, n}); + + DenseTensor out_cpu_tensor; + phi::Copy(dev_ctx, (*out), cpu_place, true, &out_cpu_tensor); + + // raw unpack + DenseTensor raw_unpack_tensor; + raw_unpack_tensor.Resize({out_cpu_tensor.dims()[0] * 2, n}); + raw_unpack_tensor.mutable_data(cpu_place); + cpu_int4_quanted_weight_raw_unpack(out_cpu_tensor, &raw_unpack_tensor); + + // transpose + DenseTensor transposed_tensor; + transposed_tensor.Resize( + {raw_unpack_tensor.dims()[1], raw_unpack_tensor.dims()[0]}); + transposed_tensor.mutable_data(cpu_place); + cpu_2d_tensor_transpose(raw_unpack_tensor, &transposed_tensor); + + // col pack + out_cpu_tensor.Resize( + {transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); + cpu_int4_quanted_weight_col_pack(transposed_tensor, &out_cpu_tensor); + + out_cpu_tensor.Resize({n / 2, m}); + out->Resize({n / 2, m}); + phi::Copy(dev_ctx, out_cpu_tensor, dev_ctx.GetPlace(), true, out); + } else { + PADDLE_FATAL( + "The algo must be in ['weight_only_int4'" + "], but got[%s]", + algo); + } +} + +} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index cb80385a7a0..8d72ed2138e 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include "../impl/metax_weight_quantize_kernel_impl.h" #include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/datatype_traits.h" @@ -120,7 +121,6 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m / 2, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); @@ -141,6 +141,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, // arch, // algo); #endif + MetaxQuantizedWeightLayoutTrans(dev_ctx, algo, weight_shape, out); } else if (algo == "w4a8") { weight_permute_gpu_w4a8(dev_ctx, x.data(), From 181772da5655782eebe905aca05d1ca612af9a46 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Wed, 29 Oct 2025 09:59:26 +0800 Subject: [PATCH 92/95] [Metax] adjust quanted weight layout transformation --- .../metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h index e6ff489b3dc..3452cceb74e 100644 --- a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h @@ -18,6 +18,7 @@ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/common_shape.h" From 29630cbb408061521a65129fb68bb1c5d3e9814f Mon Sep 17 00:00:00 2001 From: MingkunZhang <39252862+StareAtYou@users.noreply.github.com> Date: Wed, 29 Oct 2025 10:18:17 +0800 Subject: [PATCH 93/95] [Metax] add quanted weight layout transformation using CPU programming (#135) * [Metax] adjust quanted weight layout transformation --- .../impl/metax_weight_quantize_kernel_impl.h | 150 ++++++++++++++++++ .../weight_quantize_kernel_register.cu | 3 +- 2 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h diff --git a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h new file mode 100644 index 00000000000..3452cceb74e --- /dev/null +++ b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h @@ -0,0 +1,150 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { + +void cpu_2d_tensor_transpose(const DenseTensor& input_data, + DenseTensor* transposed_data) { + const int64_t input_data_rows = input_data.dims()[0]; + const int64_t input_data_cols = input_data.dims()[1]; + + const int8_t* input_data_ptr = input_data.data(); + int8_t* transposed_data_ptr = transposed_data->data(); + + for (int64_t r = 0; r < input_data_rows; r++) { + for (int64_t c = 0; c < input_data_cols; c++) { + *(transposed_data_ptr + r + c * input_data_rows) = + *(input_data_ptr + r * input_data_cols + c); + } + } +} + +void cpu_int4_quanted_weight_raw_unpack(const DenseTensor& packed_data, + DenseTensor* unpacked_data) { + const int64_t packed_data_rows = packed_data.dims()[0]; + const int64_t packed_data_cols = packed_data.dims()[1]; + + const int8_t* packed_data_ptr = packed_data.data(); + int8_t* unpacked_data_ptr = unpacked_data->data(); + + for (int64_t c = 0; c < packed_data_cols; c++) { + for (int64_t r = 0; r < packed_data_rows; r++) { + int8_t val = *(packed_data_ptr + r * packed_data_cols + c); + int8_t low_int4 = val & 0x0f; + int8_t hight_int4 = (val >> 4) & 0x0f; + + *(unpacked_data_ptr + (2 * r) * packed_data_cols + c) = + low_int4 >= 8 ? low_int4 - 16 : low_int4; + *(unpacked_data_ptr + (2 * r + 1) * packed_data_cols + c) = + hight_int4 >= 8 ? hight_int4 - 16 : hight_int4; + } + } +} + +void cpu_int4_quanted_weight_col_pack(const DenseTensor& unpacked_data, + DenseTensor* packed_data) { + const int64_t packed_data_rows = packed_data->dims()[0]; + const int64_t packed_data_cols = packed_data->dims()[1]; + + int8_t* packed_data_ptr = packed_data->data(); + const int8_t* unpacked_data_ptr = unpacked_data.data(); + + for (int64_t r = 0; r < packed_data_rows; r++) { + for (int64_t c = 0; c < packed_data_cols; c++) { + int8_t low_int4 = *(unpacked_data_ptr + 2 * r * packed_data_cols + 2 * c); + int8_t hight_int4 = + *(unpacked_data_ptr + 2 * r * packed_data_cols + 2 * c + 1); + + low_int4 = low_int4 < 0 ? low_int4 + 16 : low_int4; + hight_int4 = hight_int4 < 0 ? 
hight_int4 + 16 : hight_int4; + + *(packed_data_ptr + r * packed_data_cols + c) = + ((hight_int4 << 4) & 0xf0) | (low_int4 & 0x0f); + } + } +} + +void show_2d_cpu_tensor(const DenseTensor& tensor, const int64_t size = 3) { + const int64_t rows = tensor.dims()[0]; + const int64_t cols = tensor.dims()[1]; + printf("\nTensor shape = [%d, %d]\n", rows, cols); + + const int8_t* cpu_ptr = tensor.data(); + + for (int r = 0; r < size; r++) { + for (int c = 0; c < size; c++) { + int8_t val = *(cpu_ptr + r * cols + c); + printf("%d ", val); + } + printf("\n"); + } + printf("\n\n"); +} + +template +void MetaxQuantizedWeightLayoutTrans(const Context& dev_ctx, + const std::string& algo, + const std::vector& shape, + DenseTensor* out) { + const int64_t m = shape[0]; + const int64_t n = shape[1]; + + phi::CPUPlace cpu_place; + + if (algo == "weight_only_int4") { + out->Resize({m / 2, n}); + + DenseTensor out_cpu_tensor; + phi::Copy(dev_ctx, (*out), cpu_place, true, &out_cpu_tensor); + + // raw unpack + DenseTensor raw_unpack_tensor; + raw_unpack_tensor.Resize({out_cpu_tensor.dims()[0] * 2, n}); + raw_unpack_tensor.mutable_data(cpu_place); + cpu_int4_quanted_weight_raw_unpack(out_cpu_tensor, &raw_unpack_tensor); + + // transpose + DenseTensor transposed_tensor; + transposed_tensor.Resize( + {raw_unpack_tensor.dims()[1], raw_unpack_tensor.dims()[0]}); + transposed_tensor.mutable_data(cpu_place); + cpu_2d_tensor_transpose(raw_unpack_tensor, &transposed_tensor); + + // col pack + out_cpu_tensor.Resize( + {transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); + cpu_int4_quanted_weight_col_pack(transposed_tensor, &out_cpu_tensor); + + out_cpu_tensor.Resize({n / 2, m}); + out->Resize({n / 2, m}); + phi::Copy(dev_ctx, out_cpu_tensor, dev_ctx.GetPlace(), true, out); + } else { + PADDLE_FATAL( + "The algo must be in ['weight_only_int4'" + "], but got[%s]", + algo); + } +} + +} // namespace phi diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index cb80385a7a0..8d72ed2138e 100644 --- a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
+#include "../impl/metax_weight_quantize_kernel_impl.h" #include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/datatype_traits.h" @@ -120,7 +121,6 @@ void WeightQuantizeKernel(const Context& dev_ctx, weight_shape, arch, algo); - out->Resize({m / 2, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); @@ -141,6 +141,7 @@ void WeightQuantizeKernel(const Context& dev_ctx, // arch, // algo); #endif + MetaxQuantizedWeightLayoutTrans(dev_ctx, algo, weight_shape, out); } else if (algo == "w4a8") { weight_permute_gpu_w4a8(dev_ctx, x.data(), From 6e0d1eb4d2698772848213c85cb2009fbc1bded4 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Wed, 29 Oct 2025 16:26:22 +0800 Subject: [PATCH 94/95] [Metax] add quanted weight layout transformation using GPU programming --- .../impl/metax_weight_quantize_kernel_impl.h | 218 ++++++++++++++---- 1 file changed, 175 insertions(+), 43 deletions(-) diff --git a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h index 3452cceb74e..b305ec96a30 100644 --- a/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/metax_weight_quantize_kernel_impl.h @@ -16,14 +16,60 @@ #include +#include "paddle/common/enforce.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { +void show_2d_cpu_tensor(const DenseTensor& tensor, + const int64_t row_num = 3, + const int64_t col_num = 3) { + const int64_t rows = tensor.dims()[0]; + const int64_t cols = tensor.dims()[1]; + printf("\nTensor shape = [%d, %d]\n", rows, cols); + + const int8_t* cpu_ptr = tensor.data(); + + for (int r = 0; r < row_num; r++) { + for (int c = 0; c < col_num; c++) { + int8_t val = *(cpu_ptr + r * cols + c); + printf("%d ", val); + } + printf("\n"); + } + printf("\n\n"); +} + +void show_2d_gpu_tensor(const CustomContext& dev_ctx, + const DenseTensor& tensor, + const int64_t row_num = 3, + const int64_t col_num = 3) { + phi::CPUPlace cpu_place; + + DenseTensor cpu_tensor; + phi::Copy(dev_ctx, tensor, cpu_place, true, &cpu_tensor); + + const int64_t rows = cpu_tensor.dims()[0]; + const int64_t cols = cpu_tensor.dims()[1]; + printf("\nTensor shape = [%d, %d]\n", rows, cols); + + const int8_t* cpu_ptr = cpu_tensor.data(); + + for (int r = 0; r < row_num; r++) { + for (int c = 0; c < col_num; c++) { + int8_t val = *(cpu_ptr + r * cols + c); + printf("%d ", val); + } + printf("\n"); + } + printf("\n\n"); +} + void cpu_2d_tensor_transpose(const DenseTensor& input_data, DenseTensor* transposed_data) { const int64_t input_data_rows = input_data.dims()[0]; @@ -85,21 +131,132 @@ void cpu_int4_quanted_weight_col_pack(const DenseTensor& unpacked_data, } } -void show_2d_cpu_tensor(const DenseTensor& tensor, const int64_t size = 3) { - const int64_t rows = tensor.dims()[0]; - const int64_t cols = tensor.dims()[1]; - printf("\nTensor shape = [%d, %d]\n", rows, cols); +void cpu_int4_quantized_weight_layout_trans_impl( + const CustomContext& dev_ctx, + const std::vector& shape, + DenseTensor* out) { + const int64_t m = shape[0]; + const int64_t n = shape[1]; - const int8_t* cpu_ptr = tensor.data(); + phi::CPUPlace 
cpu_place; - for (int r = 0; r < size; r++) { - for (int c = 0; c < size; c++) { - int8_t val = *(cpu_ptr + r * cols + c); - printf("%d ", val); - } - printf("\n"); + out->Resize({m / 2, n}); + + DenseTensor out_cpu_tensor; + phi::Copy(dev_ctx, (*out), cpu_place, true, &out_cpu_tensor); + + // raw unpack + DenseTensor raw_unpack_tensor; + raw_unpack_tensor.Resize({out_cpu_tensor.dims()[0] * 2, n}); + raw_unpack_tensor.mutable_data(cpu_place); + cpu_int4_quanted_weight_raw_unpack(out_cpu_tensor, &raw_unpack_tensor); + + // transpose + DenseTensor transposed_tensor; + transposed_tensor.Resize( + {raw_unpack_tensor.dims()[1], raw_unpack_tensor.dims()[0]}); + transposed_tensor.mutable_data(cpu_place); + cpu_2d_tensor_transpose(raw_unpack_tensor, &transposed_tensor); + + // col pack + out_cpu_tensor.Resize( + {transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); + cpu_int4_quanted_weight_col_pack(transposed_tensor, &out_cpu_tensor); + + out_cpu_tensor.Resize({n / 2, m}); + out->Resize({n / 2, m}); + phi::Copy(dev_ctx, out_cpu_tensor, dev_ctx.GetPlace(), true, out); +} + +__global__ void int4_quanted_matrix_raw_unpack_kernel(const int8_t* mat, + int8_t* unpack_mat, + int M, + int N) { + int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + + int i = global_idx / N; + int j = global_idx % N; + + if (global_idx >= M * N) { + return; } - printf("\n\n"); + + int8_t val = mat[global_idx]; + int8_t low = val & 0x0F; + int8_t mask = ((low & 0x80) == 0) & ((low & 0x78) != 0); + low -= 16 * mask; + + int8_t high = (val >> 4) & 0x0F; + mask = ((high & 0x80) == 0) & ((high & 0x78) != 0); + high -= 16 * mask; + + int output_global_idx0 = (2 * i) * N + j; + int output_global_idx1 = (2 * i + 1) * N + j; + + unpack_mat[output_global_idx0] = low; + unpack_mat[output_global_idx1] = high; +} + +__global__ void int4_quanted_matrix_col_pack_kernel(const int8_t* mat, + int8_t* pack_mat, + int M, + int N) { + int global_idx = blockIdx.x * blockDim.x + threadIdx.x; + + int i = global_idx / N; + int j = global_idx % N; + + if (global_idx >= M * N) { + return; + } + + int mat_global_idx0 = i * 2 * N + 2 * j; + int mat_global_idx1 = i * 2 * N + 2 * j + 1; + + int8_t low = mat[mat_global_idx0] & 0x0F; + low = low + ((low >> 3) & 1) * 16; + + int8_t high = mat[mat_global_idx1] & 0x0F; + high = high + ((high >> 3) & 1) * 16; + + pack_mat[global_idx] = ((high << 4) & 0xf0) | (low & 0x0f); +} + +void gpu_int4_quantized_weight_layout_trans_impl( + const CustomContext& dev_ctx, + const std::vector& shape, + DenseTensor* out) { + int64_t total_m = shape[0]; + int64_t total_n = shape[1]; + out->Resize({total_m / 2, total_n}); + + DenseTensor unpack_mat(out->type()); + unpack_mat.Resize({total_m, total_n}); + dev_ctx.template Alloc(&unpack_mat); + + constexpr int kBlockSize = 64; + int64_t kGridSize = (out->numel() + kBlockSize - 1) / kBlockSize; + int4_quanted_matrix_raw_unpack_kernel<<>>( + out->data(), + unpack_mat.data(), + out->dims()[0], + out->dims()[1]); + + DenseTensor transposed_tensor; + transposed_tensor.Resize({unpack_mat.dims()[1], unpack_mat.dims()[0]}); + dev_ctx.template Alloc(&transposed_tensor); + std::vector axis = {1, 0}; + funcs::Transpose trans; + trans(dev_ctx, unpack_mat, &transposed_tensor, axis); + + out->Resize({transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); + int4_quanted_matrix_col_pack_kernel<<>>( + transposed_tensor.data(), + out->data(), + out->dims()[0], + out->dims()[1]); + + out->Resize({total_n / 2, total_m}); } template @@ -107,38 +264,13 @@ void 
MetaxQuantizedWeightLayoutTrans(const Context& dev_ctx, const std::string& algo, const std::vector& shape, DenseTensor* out) { - const int64_t m = shape[0]; - const int64_t n = shape[1]; - - phi::CPUPlace cpu_place; - if (algo == "weight_only_int4") { - out->Resize({m / 2, n}); - - DenseTensor out_cpu_tensor; - phi::Copy(dev_ctx, (*out), cpu_place, true, &out_cpu_tensor); - - // raw unpack - DenseTensor raw_unpack_tensor; - raw_unpack_tensor.Resize({out_cpu_tensor.dims()[0] * 2, n}); - raw_unpack_tensor.mutable_data(cpu_place); - cpu_int4_quanted_weight_raw_unpack(out_cpu_tensor, &raw_unpack_tensor); - - // transpose - DenseTensor transposed_tensor; - transposed_tensor.Resize( - {raw_unpack_tensor.dims()[1], raw_unpack_tensor.dims()[0]}); - transposed_tensor.mutable_data(cpu_place); - cpu_2d_tensor_transpose(raw_unpack_tensor, &transposed_tensor); - - // col pack - out_cpu_tensor.Resize( - {transposed_tensor.dims()[0], transposed_tensor.dims()[1] / 2}); - cpu_int4_quanted_weight_col_pack(transposed_tensor, &out_cpu_tensor); - - out_cpu_tensor.Resize({n / 2, m}); - out->Resize({n / 2, m}); - phi::Copy(dev_ctx, out_cpu_tensor, dev_ctx.GetPlace(), true, out); + if (dev_ctx.GetPlace() == phi::CPUPlace()) { + cpu_int4_quantized_weight_layout_trans_impl(dev_ctx, shape, out); + } else { + gpu_int4_quantized_weight_layout_trans_impl(dev_ctx, shape, out); + } + } else { PADDLE_FATAL( "The algo must be in ['weight_only_int4'" From f07af1c2f07a8586a568c14310cd965c95e9b7b2 Mon Sep 17 00:00:00 2001 From: tianshuo78520a Date: Wed, 29 Oct 2025 16:33:12 +0000 Subject: [PATCH 95/95] Update Paddle submodule to latest develop --- Paddle | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Paddle b/Paddle index 1f00e2178ad..b51d1da36de 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 1f00e2178ad3249ecd8bb83e59bc6ac1ebcac413 +Subproject commit b51d1da36debb9faaa4197629c82c0fe907a94c9
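For reference, the weight layout transformation added in the two commits above reduces to nibble manipulation: every int8 byte holds two signed int4 values, and the transformation unpacks each pair, transposes the widened int8 view, and packs pairs again in the transposed layout. The short standalone C++ sketch below only illustrates the low/high nibble packing and the sign-extension rule that the CPU helpers and the CUDA kernels share; pack_int4_pair and sign_extend_nibble are illustrative names, not functions from the patch, and the sketch is not part of the patched sources.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Pack two signed 4-bit values (each in [-8, 7]) into one byte:
// the first value goes into the low nibble, the second into the high nibble.
int8_t pack_int4_pair(int low, int high) {
  return static_cast<int8_t>(((high & 0x0f) << 4) | (low & 0x0f));
}

// Sign-extend a 4-bit nibble (0..15) back to a signed value in [-8, 7];
// this is the "subtract 16 when bit 3 is set" rule used by the
// raw-unpack logic in the patch above.
int sign_extend_nibble(int nibble) {
  nibble &= 0x0f;
  return (nibble & 0x08) ? nibble - 16 : nibble;
}

int main() {
  for (int low = -8; low <= 7; ++low) {
    for (int high = -8; high <= 7; ++high) {
      int8_t packed = pack_int4_pair(low, high);
      assert(sign_extend_nibble(packed & 0x0f) == low);          // low nibble
      assert(sign_extend_nibble((packed >> 4) & 0x0f) == high);  // high nibble
    }
  }
  printf("int4 nibble pack/unpack round-trip OK\n");
  return 0;
}

The loop exercises all 256 value pairs, which is the round-trip property the raw-unpack and col-pack steps are expected to preserve when moving the quantized weights between the row-packed and column-packed layouts.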