diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml
index fd7d04c0843..353cbb098b6 100644
--- a/.github/workflows/metax_work.yaml
+++ b/.github/workflows/metax_work.yaml
@@ -5,12 +5,6 @@ on:
   pull_request:
     types: [opened, synchronize]
     branches: [develop, release/**]
-    paths:
-      - "**"
-      - "Paddle/**"
-      - "!backends/**"
-      - "backends/metax_gpu/**"
-
 permissions: read-all
 
 defaults:
@@ -40,16 +34,48 @@ jobs:
           git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head
           git checkout pull/${{ github.event.pull_request.number }}/head
-          git submodule update --init --recursive
+
+
+
+          paddle_branch=${{ github.base_ref || github.ref_name}}
+          echo $paddle_branch
+          # sleep 10000
+          change_numbers=$(git diff --name-only remotes/origin/${paddle_branch} | wc -l)
+          echo $change_numbers
+
+
+          change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/" || true)
+          echo $change_backend
+          change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep -c "backends/metax_gpu" || true)
+          echo $change_metax_only
+
+          # change_backend=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/"| wc -l)
+          # echo $change_backend
+          # change_metax_only=$(git diff --name-only remotes/origin/${paddle_branch} | grep "backends/metax_gpu"| wc -l)
+          # echo $change_metax_only
+
+          git diff --name-only remotes/origin/${paddle_branch}
+
+          if [ $change_numbers -ne $change_backend ]; then
+            echo "Common file changed, continue to run metax FULL CI test ..."
+          elif [ $change_metax_only -eq 0 ]; then
+            echo "No metax backend changes found, skip metax FULL CI ..."
+            exit 0
+          fi
+
+
+          # git submodule update --init --recursive
           fi
       - name: compile
         run: |
+          # sleep 10000
           cd backends/metax_gpu
           bash build.sh
       - name: run test
+        run: |
           cd backends/metax_gpu/tests
           bash run_test.sh -j 16
diff --git a/Paddle b/Paddle
index cc367e8767d..945ea696575 160000
--- a/Paddle
+++ b/Paddle
@@ -1 +1 @@
-Subproject commit cc367e8767d49819b5100f22e279cd62a1587670
+Subproject commit 945ea69657591c6f702cbb9ccee0d9eefe9bf5f7
diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index e357a5e5912..3e92996f9a2 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -703,9 +703,9 @@ file(
 set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS})
 set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA)
-
+set(MACA_PATH $ENV{MACA_PATH})
 set(CMAKE_CUCC_COMPILER "cucc")
-set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/")
+set(CMAKE_CUCC_FLAGS "-I ${MACA_PATH}/tools/cu-bridge/include/")
 
 add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS})
@@ -734,9 +734,9 @@ target_link_libraries(
   ${WARPRNNT_LIBRARIES}
   ${PADDLE_CORE_LIB})
 
-target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so)
-target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so)
-target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so)
+target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmccl.so)
+target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcFlashAttn.so)
+target_link_libraries(${TARGET_NAME} ${MACA_PATH}/lib/libmcpti.so)
 
 include_directories(BEFORE ${PADDLE_SOURCE_DIR})
diff --git a/backends/metax_gpu/cmake/dgc.cmake b/backends/metax_gpu/cmake/dgc.cmake
index 4c54e636d5e..4c61f2e6bcb 100644
--- a/backends/metax_gpu/cmake/dgc.cmake
+++ b/backends/metax_gpu/cmake/dgc.cmake
@@ -62,8 +62,8 @@ if(EXISTS
${DGC_DOWNLOAD_DIR}/${DGC_CACHE_FILENAME}) else() download_dgc() endif() - -set(CU_BRIDGE_PATH "/opt/maca/tools/cu-bridge") +set(MACA_PATH $ENV{MACA_PATH}) +set(CU_BRIDGE_PATH "${MACA_PATH}/tools/cu-bridge") add_custom_command( OUTPUT "${CU_BRIDGE_PATH}/bin/nvcc" diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu index 6cdfb2f5242..6c46ef10c0f 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_grad_kernel_register.cu @@ -119,7 +119,22 @@ void ActivationGradGPUImpl(const Context& dev_ctx, ActivationGradGPUImpl>( \ dev_ctx, &x, nullptr, &dout, dx, functor); \ } - +#define DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX( \ + name, functor_class, attr1, attr2) \ + template \ + void name##GradKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& dout, \ + double attr1, \ + double attr2, \ + DenseTensor* dx) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGradGPUImpl>( \ + dev_ctx, &x, nullptr, &dout, dx, functor); \ + } #define DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPOUT(name, functor_class) \ template \ void name##GradKernel(const Context& dev_ctx, \ @@ -239,10 +254,10 @@ DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(STanh, scale_a, scale_b); -DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPX(Softplus, - CudaSoftplusGradFunctor, - beta, - threshold); +DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_DOUBLE_ATTRS_DEPX(Softplus, + CudaSoftplusGradFunctor, + beta, + threshold); DEFINE_GPU_ACT_GRAD_KERNEL_WITH_TWO_ATTRS_DEPOUT(HardSigmoid, CudaHardSigmoidGradFunctor, slope, diff --git a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu index f24f3e8abbc..363932cfc28 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/activation_kernel_register.cu @@ -90,7 +90,21 @@ void ActivationGPUImpl(const Context& dev_ctx, ActivationGPUImpl>( \ dev_ctx, x, out, functor); \ } - +#define DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS( \ + name, functor_class, attr1, attr2) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + double attr1, \ + double attr2, \ + DenseTensor* out) { \ + funcs::functor_class functor; \ + auto attrs = functor.GetAttrs(); \ + *(attrs[0].second) = attr1; \ + *(attrs[1].second) = attr2; \ + ActivationGPUImpl>( \ + dev_ctx, x, out, functor); \ + } DEFINE_GPU_ACTIVATION_KERNEL(Cos, CudaCosFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Tan, CudaTanFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Acos, CudaAcosFunctor) @@ -139,10 +153,10 @@ DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardTanh, t_min, t_max) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Stanh, CudaSTanhFunctor, scale_a, scale_b) -DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(Softplus, - CudaSoftplusFunctor, - beta, - threshold) +DEFINE_GPU_ACT_KERNEL_WITH_TWO_DOUBLE_ATTRS(Softplus, + CudaSoftplusFunctor, + beta, + threshold) DEFINE_GPU_ACT_KERNEL_WITH_TWO_ATTRS(HardSigmoid, CudaHardSigmoidFunctor, slope, diff --git a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu index 4e2a4ce240c..44ac7f2fddc 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu @@ -115,11 +115,12 @@ void WeightQuantizeKernel(const Context& dev_ctx, dev_ctx.template Alloc(scale); weight_quant_gpu(dev_ctx, x.data(), - quanted_x.data(), + out->data(), scale->data(), weight_shape, arch, algo); + out->Resize({m, n}); #ifdef PADDLE_WITH_HIP DenseTensor x_int_tmp(out->type()); x_int_tmp.Resize({m, n / 2}); @@ -133,12 +134,12 @@ void WeightQuantizeKernel(const Context& dev_ctx, funcs::Transpose trans; trans(dev_ctx, x_int_tmp, out, axis); #else - weight_permute_gpu(dev_ctx, - quanted_x.data(), - out->data(), - weight_shape, - arch, - algo); + // weight_permute_gpu(dev_ctx, + // quanted_x.data(), + // out->data(), + // weight_shape, + // arch, + // algo); #endif } else if (algo == "w4a8") { weight_permute_gpu_w4a8(dev_ctx, diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 0c84ada4b65..084b5b8c601 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -49,7 +49,13 @@ foreach(test_name ${TEST_PROGRAMS}) continue() endif() - set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + string(FIND "${test_name}" "metax" METAX_SUFFIX_POS) + if(NOT METAX_SUFFIX_POS EQUAL -1) + set(CURRENT_TEST_PROGRAM ${METAX_UNIT_TEST_PATH}/${test_name}.py) + else() + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + endif() + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") else() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py new file mode 100644 index 00000000000..cd4cd290065 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_layer_metax.py @@ -0,0 +1,381 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
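+
+# NOTE: this file appears to be a metax_gpu adaptation of Paddle's legacy
+# test_conv3d_layer suite. The core.set_cudnn_switch(False) call below is
+# presumably there to keep Conv3D off the cuDNN path on this backend; the
+# cases then check that the static-graph layer, the functional F.conv3d API,
+# and the dygraph nn.Conv3D layer agree numerically on CPU and on the custom
+# device place.
+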
+import unittest + +import numpy as np +from op_test import get_device_place, is_custom_device +from test_conv3d_op import conv3d_forward_naive + +import paddle +import paddle.base.dygraph as dg +import paddle.nn.functional as F +from paddle import base, nn +from paddle.base import core + +core.set_cudnn_switch(False) + + +class Conv3DTestCase(unittest.TestCase): + def __init__( + self, + methodName="runTest", + batch_size=4, + spatial_shape=(8, 8, 8), + num_channels=6, + num_filters=8, + filter_size=3, + padding=0, + stride=1, + dilation=1, + groups=1, + no_bias=False, + data_format="NCDHW", + dtype="float32", + ): + super().__init__(methodName) + self.batch_size = batch_size + self.num_channels = num_channels + self.num_filters = num_filters + self.spatial_shape = spatial_shape + self.filter_size = filter_size + + self.padding = padding + self.stride = stride + self.dilation = dilation + self.groups = groups + self.no_bias = no_bias + self.data_format = data_format + self.dtype = dtype + + def setUp(self): + self.channel_last = self.data_format == "NDHWC" + if self.channel_last: + input_shape = ( + self.batch_size, + *self.spatial_shape, + self.num_channels, + ) + else: + input_shape = ( + self.batch_size, + self.num_channels, + *self.spatial_shape, + ) + self.input = np.random.randn(*input_shape).astype(self.dtype) + + if isinstance(self.filter_size, int): + filter_size = [self.filter_size] * 3 + else: + filter_size = self.filter_size + self.weight_shape = weight_shape = ( + self.num_filters, + self.num_channels // self.groups, + *filter_size, + ) + self.weight = np.random.uniform(-1, 1, size=weight_shape).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform(-1, 1, size=(self.num_filters,)).astype( + self.dtype + ) + else: + self.bias = None + + def base_layer(self, place): + main = base.Program() + start = base.Program() + with ( + base.unique_name.guard(), + base.program_guard(main, start), + ): + input_shape = ( + (-1, -1, -1, -1, self.num_channels) + if self.channel_last + else (-1, self.num_channels, -1, -1, -1) + ) + x_var = paddle.static.data("input", input_shape, dtype=self.dtype) + weight_attr = paddle.nn.initializer.Assign(self.weight) + if self.bias is None: + bias_attr = False + else: + bias_attr = paddle.nn.initializer.Assign(self.bias) + y_var = paddle.nn.Conv3D( + in_channels=self.num_channels, + out_channels=self.num_filters, + kernel_size=self.filter_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + padding_mode="zeros", + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=self.data_format, + )(x_var) + feed_dict = {"input": self.input} + exe = base.Executor(place) + exe.run(start) + (y_np,) = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def functional(self, place): + main = base.Program() + start = base.Program() + with ( + base.unique_name.guard(), + base.program_guard(main, start), + ): + input_shape = ( + (-1, -1, -1, -1, self.num_channels) + if self.channel_last + else (-1, self.num_channels, -1, -1, -1) + ) + x_var = paddle.static.data("input", input_shape, dtype=self.dtype) + w_var = paddle.static.data("weight", self.weight_shape, dtype=self.dtype) + if not self.no_bias: + b_var = paddle.static.data( + "bias", (self.num_filters,), dtype=self.dtype + ) + else: + b_var = None + y_var = F.conv3d( + x_var, + w_var, + b_var, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format, + ) + 
feed_dict = {"input": self.input, "weight": self.weight} + if self.bias is not None: + feed_dict["bias"] = self.bias + exe = base.Executor(place) + exe.run(start) + (y_np,) = exe.run(main, feed=feed_dict, fetch_list=[y_var]) + return y_np + + def paddle_nn_layer(self): + x_var = paddle.to_tensor(self.input) + x_var.stop_gradient = False + conv = nn.Conv3D( + self.num_channels, + self.num_filters, + self.filter_size, + padding=self.padding, + stride=self.stride, + dilation=self.dilation, + groups=self.groups, + data_format=self.data_format, + ) + conv.weight.set_value(self.weight) + if not self.no_bias: + conv.bias.set_value(self.bias) + y_var = conv(x_var) + y_var.backward() + y_np = y_var.numpy() + t1 = x_var.gradient() + return y_np, t1 + + def _test_pir_equivalence(self, place): + with paddle.pir_utils.IrGuard(): + result1 = self.base_layer(place) + result2 = self.functional(place) + with dg.guard(place): + result3, g1 = self.paddle_nn_layer() + np.testing.assert_array_almost_equal(result1, result2) + np.testing.assert_array_almost_equal(result2, result3) + + def runTest(self): + place = base.CPUPlace() + self._test_pir_equivalence(place) + + if base.core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + self._test_pir_equivalence(place) + + +class Conv3DErrorTestCase(Conv3DTestCase): + def runTest(self): + place = base.CPUPlace() + with ( + dg.guard(place), + self.assertRaises(ValueError), + ): + self.paddle_nn_layer() + + +def add_cases(suite): + suite.addTest(Conv3DTestCase(methodName="runTest")) + suite.addTest(Conv3DTestCase(methodName="runTest", stride=[1, 2, 1], dilation=2)) + suite.addTest(Conv3DTestCase(methodName="runTest", stride=2, dilation=(2, 1, 2))) + suite.addTest(Conv3DTestCase(methodName="runTest", padding="same", no_bias=True)) + suite.addTest( + Conv3DTestCase(methodName="runTest", filter_size=(3, 2, 3), padding="valid") + ) + suite.addTest(Conv3DTestCase(methodName="runTest", padding=(2, 3, 1))) + suite.addTest(Conv3DTestCase(methodName="runTest", padding=[1, 2, 2, 1, 2, 3])) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + padding=[[0, 0], [0, 0], [1, 2], [2, 1], [2, 2]], + ) + ) + suite.addTest(Conv3DTestCase(methodName="runTest", data_format="NDHWC")) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + data_format="NDHWC", + padding=[[0, 0], [1, 1], [3, 3], [2, 2], [0, 0]], + ) + ) + suite.addTest(Conv3DTestCase(methodName="runTest", groups=2, padding="valid")) + suite.addTest( + Conv3DTestCase( + methodName="runTest", + num_filters=6, + num_channels=3, + groups=3, + padding="valid", + ) + ) + + +def add_error_cases(suite): + suite.addTest(Conv3DErrorTestCase(methodName="runTest", num_channels=5, groups=2)) + suite.addTest( + Conv3DErrorTestCase( + methodName="runTest", num_channels=5, groups=2, padding=[-1, 1, 3] + ) + ) + + +def load_tests(loader, standard_tests, pattern): + suite = unittest.TestSuite() + add_cases(suite) + add_error_cases(suite) + return suite + + +def get_places(): + places = [] + if core.is_compiled_with_xpu(): + places.append(paddle.device.XPUPlace(0)) + elif core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + places.append(paddle.CPUPlace()) + return places + + +class TestConv3dAPI_Compatibility(unittest.TestCase): + def setUp(self): + np.random.seed(2025) + self.places = get_places() + self.shape_x = [2, 3, 8, 8, 8] # NCDHW + self.shape_w = [6, 3, 3, 3, 3] # Co, Cin, kD, kH, kW + self.dtype = "float32" + self.init_data() + + def init_data(self): + self.np_x = 
np.random.rand(*self.shape_x).astype(self.dtype) + self.np_w = np.random.rand(*self.shape_w).astype(self.dtype) + conv_param = { + "stride": [1, 1, 1], + "pad": [0, 0, 0], + "dilation": [1, 1, 1], + } + self.np_ref_out = conv3d_forward_naive(self.np_x, self.np_w, 1, conv_param) + + def test_dygraph_Compatibility(self): + for place in self.places: + paddle.device.set_device(place) + paddle.disable_static() + x = paddle.to_tensor(self.np_x) + w = paddle.to_tensor(self.np_w) + + paddle_dygraph_out = [] + # Position args (args) + out1 = paddle.nn.functional.conv3d(x, w) + paddle_dygraph_out.append(out1) + # Key words args (kwargs) for paddle + out2 = paddle.nn.functional.conv3d(x=x, weight=w) + paddle_dygraph_out.append(out2) + # Key words args for alias compatibility + out3 = paddle.nn.functional.conv3d(input=x, weight=w) + paddle_dygraph_out.append(out3) + # Combined args and kwargs + out4 = paddle.nn.functional.conv3d(x, weight=w) + paddle_dygraph_out.append(out4) + + # refer to test/xpu/test_conv3d_op_xpu.py + if isinstance(place, core.XPUPlace): + rtol = 5e-3 + atol = 5e-3 + else: + rtol = 1e-5 + atol = 0 + + # Check all dygraph results against reference + for out in paddle_dygraph_out: + np.testing.assert_allclose( + self.np_ref_out, out.numpy(), rtol=rtol, atol=atol + ) + paddle.enable_static() + + def test_static_Compatibility(self): + paddle.enable_static() + + fetch_list = [] + main = paddle.static.Program() + startup = paddle.static.Program() + with base.program_guard(main, startup): + x = paddle.static.data(name="x", shape=self.shape_x, dtype=self.dtype) + w = paddle.static.data(name="w", shape=self.shape_w, dtype=self.dtype) + + # Position args (args) + out1 = paddle.nn.functional.conv3d(x, w) + fetch_list.append(out1) + # Key words args (kwargs) for paddle + out2 = paddle.nn.functional.conv3d(x=x, weight=w) + fetch_list.append(out2) + # Key words args for alias compatibility + out3 = paddle.nn.functional.conv3d(input=x, weight=w) + fetch_list.append(out3) + # Combined args and kwargs + out4 = paddle.nn.functional.conv3d(x, weight=w) + fetch_list.append(out4) + + for place in self.places: + # refer to test/xpu/test_conv2d_op_xpu.py + if isinstance(place, core.XPUPlace): + rtol = 5e-3 + atol = 5e-3 + else: + rtol = 1e-5 + atol = 0 + + exe = base.Executor(place) + fetches = exe.run( + main, + feed={"x": self.np_x, "w": self.np_w}, + fetch_list=fetch_list, + ) + for out in fetches: + np.testing.assert_allclose( + out, self.np_ref_out, rtol=rtol, atol=atol + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py new file mode 100644 index 00000000000..6f55aac3361 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_op_metax.py @@ -0,0 +1,764 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
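+
+# NOTE: this file appears to mirror Paddle's legacy test_conv3d_transpose_op
+# suite for the metax_gpu backend. It builds a NumPy reference
+# (conv3dtranspose_forward_naive), turns the cuDNN switch off via
+# core.set_cudnn_switch(False), and then registers the usual FP16/BF16
+# variants through create_test_cudnn_fp16_class / create_test_cudnn_bf16_class.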
+ +import unittest + +import numpy as np + +import paddle + +paddle.enable_static() +from op_test import ( + OpTest, + copy_bits_from_float_to_uint16, + get_device_place, + is_custom_device, +) + +from paddle.base import core + +core.set_cudnn_switch(False) + + +def convert_float_to_uint16(float_list, data_format="NCHW"): + if data_format == "NHWC": + float_list = np.transpose(float_list, [0, 4, 1, 2, 3]) + + new_output = [] + for x in np.nditer(float_list): + new_output.append(np.uint16(copy_bits_from_float_to_uint16(x))) + new_output = np.reshape(new_output, float_list.shape).view(np.uint16) + + if data_format == "NHWC": + new_output = np.transpose(new_output, [0, 2, 3, 4, 1]) + return new_output + + +def conv3dtranspose_forward_naive(input_, filter_, attrs): + padding_algorithm = attrs["padding_algorithm"] + if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: + raise ValueError( + f"Unknown Attr(padding_algorithm): '{padding_algorithm}'. " + "It can only be 'SAME' or 'VALID'." + ) + + if attrs["data_format"] == "NHWC": + input_ = np.transpose(input_, [0, 4, 1, 2, 3]) + in_n, in_c, in_d, in_h, in_w = input_.shape + f_c, f_out_c, f_d, f_h, f_w = filter_.shape + groups = attrs["groups"] + assert in_c == f_c + out_c = f_out_c * groups + sub_in_c = in_c // groups + + stride, pad, dilations = ( + attrs["strides"], + attrs["paddings"], + attrs["dilations"], + ) + + def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): + padding = [] + for input_size, filter_size, stride_size in zip( + input_shape, kernel_size, kernel_stride + ): + out_size = int((input_size + stride_size - 1) / stride_size) + pad_sum = np.max( + ((out_size - 1) * stride_size + filter_size - input_size, 0) + ) + pad_0 = int(pad_sum / 2) + pad_1 = int(pad_sum - pad_0) + padding.append(pad_0) + padding.append(pad_1) + return padding + + ksize = filter_.shape[2:5] + if padding_algorithm == "VALID": + pad = [0, 0, 0, 0, 0, 0] + elif padding_algorithm == "SAME": + dilations = [1, 1, 1] + input_data_shape = input_.shape[2:5] + pad = _get_padding_with_SAME(input_data_shape, ksize, stride) + + pad_d_0, pad_d_1 = pad[0], pad[0] + pad_h_0, pad_h_1 = pad[1], pad[1] + pad_w_0, pad_w_1 = pad[2], pad[2] + if len(pad) == 6: + pad_d_0, pad_d_1 = pad[0], pad[1] + pad_h_0, pad_h_1 = pad[2], pad[3] + pad_w_0, pad_w_1 = pad[4], pad[5] + + d_block_d = dilations[0] * (f_d - 1) + 1 + d_block_h = dilations[1] * (f_h - 1) + 1 + d_block_w = dilations[2] * (f_w - 1) + 1 + out_d = (in_d - 1) * stride[0] + d_block_d + out_h = (in_h - 1) * stride[1] + d_block_h + out_w = (in_w - 1) * stride[2] + d_block_w + out = np.zeros((in_n, out_c, out_d, out_h, out_w)) + + for n in range(in_n): + for d in range(in_d): + for i in range(in_h): + for j in range(in_w): + for g in range(groups): + input_masked = input_[ + n, g * sub_in_c : (g + 1) * sub_in_c, d, i, j + ] # (c) + input_masked = np.reshape(input_masked, (sub_in_c, 1, 1, 1)) + input_masked = np.tile(input_masked, (1, f_d, f_h, f_w)) + + for k in range(f_out_c): + tmp_out = np.sum( + input_masked + * filter_[ + g * sub_in_c : (g + 1) * sub_in_c, + k, + :, + :, + :, + ], + axis=0, + ) + d1, d2 = d * stride[0], d * stride[0] + d_block_d + i1, i2 = i * stride[1], i * stride[1] + d_block_h + j1, j2 = j * stride[2], j * stride[2] + d_block_w + out[ + n, + g * f_out_c + k, + d1 : d2 : dilations[0], + i1 : i2 : dilations[1], + j1 : j2 : dilations[2], + ] += tmp_out + + out = out[ + :, + :, + pad_d_0 : out_d - pad_d_1, + pad_h_0 : out_h - pad_h_1, + pad_w_0 : out_w - pad_w_1, + ] + if 
attrs["data_format"] == "NHWC": + out = np.transpose(out, [0, 2, 3, 4, 1]) + return out + + +def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda() or is_custom_device(): + place = get_device_place() + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): + place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ["Input"], "Output", no_grad_set={"Filter"} + ) + + def test_check_grad_no_input(self): + place = get_device_place() + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ["Filter"], "Output", no_grad_set={"Input"} + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNFP16OP") + TestConv3DTransposeCUDNNFP16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNFP16 + + +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()) + or not core.is_bfloat16_supported(get_device_place()), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def test_check_output(self): + place = get_device_place() + self.check_output_with_place(place) + + def test_check_grad(self): + place = get_device_place() + self.check_grad_with_place( + place, + {"Input", "Filter"}, + "Output", + ) + + def test_check_grad_no_filter(self): + place = get_device_place() + self.check_grad_with_place( + place, + ["Input"], + "Output", + no_grad_set={"Filter"}, + ) + + def test_check_grad_no_input(self): + place = get_device_place() + self.check_grad_with_place( + place, + ["Filter"], + "Output", + no_grad_set={"Input"}, + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DTransposeCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNBF16 + + +def conv3d_transpose_wrapper( + x, + weight, + stride=1, + padding=0, + output_padding=[], + output_size=[], + padding_algorithm="EXPLICIT", + groups=1, + dilation=1, + data_format="NCDHW", +): + if data_format == "AnyLayout": + data_format = "NCDHW" + return paddle._C_ops.conv3d_transpose( + x, + weight, + stride, + padding, + output_padding, + output_size, + padding_algorithm, + groups, + dilation, + data_format, + ) + + +class TestConv3DTransposeOp(OpTest): + def setUp(self): + # init as conv transpose + self.use_cudnn = False + self.check_no_input = False + self.check_no_filter = False + self.data_format = "NCHW" + self.pad = [0, 0, 0] + self.padding_algorithm = "EXPLICIT" + self.init_op_type() + self.init_kernel_type() + self.init_test_case() + + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "padding_algorithm": self.padding_algorithm, + "dilations": self.dilations, + "groups": self.groups, + "use_cudnn": 
self.use_cudnn, + "data_format": self.data_format, + } + + output = conv3dtranspose_forward_naive(input, filter, self.attrs).astype( + "float32" + ) + + if self.is_bfloat16_op(): + self.inputs = { + "Input": convert_float_to_uint16(input), + "Filter": convert_float_to_uint16(filter), + } + else: + self.inputs = { + "Input": input, + "Filter": filter, + } + output = output.astype(self.dtype) + + self.outputs = {"Output": output} + + def test_check_output(self): + if self.use_cudnn: + place = get_device_place() + self.check_output_with_place(place, atol=1e-5) + else: + self.check_output() + + def test_check_grad(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + {"Input", "Filter"}, + "Output", + max_relative_error=0.03, + ) + else: + self.check_grad({"Input", "Filter"}, "Output", max_relative_error=0.03) + + def test_check_grad_no_filter(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + ["Input"], + "Output", + max_relative_error=0.03, + no_grad_set={"Filter"}, + ) + elif self.check_no_filter: + self.check_grad( + ["Input"], + "Output", + max_relative_error=0.03, + no_grad_set={"Filter"}, + ) + + def test_check_grad_no_input(self): + if self.use_cudnn: + place = get_device_place() + self.check_grad_with_place( + place, + ["Filter"], + "Output", + max_relative_error=0.03, + no_grad_set={"Input"}, + ) + elif self.check_no_input: + self.check_grad( + ["Filter"], + "Output", + max_relative_error=0.03, + no_grad_set={"Input"}, + ) + + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 3, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + def init_kernel_type(self): + self.dtype = np.float32 + + +class TestWithSymmetricPad(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_input = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithAsymmetricPad(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 1, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithSAMEPad(TestConv3DTransposeOp): + def init_test_case(self): + self.stride = [1, 1, 2] + self.dilations = [1, 2, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 6] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 4] + self.padding_algorithm = "SAME" + + +class TestWithVALIDPad(TestConv3DTransposeOp): + def init_test_case(self): + self.stride = [2, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 4, 3] + self.padding_algorithm = "VALID" + + +class TestWithStride(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_filter = True + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class TestWithGroups(TestConv3DTransposeOp): + def init_test_case(self): + 
self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 2, 5, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3, 3] + + +class TestWithDilation(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + +class Test_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +# ------------ test_cudnn ------------ +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNN(TestConv3DTransposeOp): + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSymmetricPad(TestWithSymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithAsymmetricPad(TestWithAsymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1, 0, 0, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 4, 4, 4] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSAMEPad(TestWithSAMEPad): + def init_test_case(self): + self.stride = [1, 1, 2] + self.dilations = [1, 2, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 4, 3] + self.padding_algorithm = "SAME" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithVALIDPad(TestWithVALIDPad): + def init_test_case(self): + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.padding_algorithm = "VALID" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithStride(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + 
self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 2, 5, 5, 5] # NCDHW + f_c = self.input_size[1] + self.filter_size = [f_c, 6, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithGroups(TestWithGroups): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 2, 5, 5, 5] # NCHW + f_c = self.input_size[1] + self.filter_size = [f_c, 3, 3, 3, 3] + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + # Please Don't remove the following code. + # Currently, CI use cudnn V5.0 which not support dilation conv. + # class TestCUDNNWithDilation(TestWithDilation): + # def init_test_case(self): + # self.pad = [1, 1, 1] + # self.stride = [2, 2, 2] + # self.dilations = [2, 2, 2] + # self.input_size = [2, 3, 5, 5, 5] # NCDHW + # f_c = self.input_size[1] + # self.filter_size = [f_c, 6, 3, 3, 3] + # + # def init_op_type(self): + # self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNN_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [0, 0, 0] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithSymmetricPad_NHWC(TestWithSymmetricPad): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithAsymmetricPad_NHWC(TestWithAsymmetricPad): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 0, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithStride_NHWC(TestWithStride): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + 
self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +@unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", +) +class TestCUDNNWithGroups_NHWC(TestWithGroups): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [1, 5, 5, 5, 2] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3, 3] + self.data_format = "NHWC" + + def init_op_type(self): + self.use_cudnn = True + self.op_type = "conv3d_transpose" + self.python_api = conv3d_transpose_wrapper + + +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestConv3DTransposeOp) +create_test_cudnn_fp16_class(TestWithSymmetricPad) +create_test_cudnn_fp16_class(TestWithAsymmetricPad) +create_test_cudnn_fp16_class(TestWithSAMEPad) +create_test_cudnn_fp16_class(TestWithVALIDPad) +create_test_cudnn_fp16_class(TestWithStride) +create_test_cudnn_fp16_class(TestWithGroups) +create_test_cudnn_fp16_class(TestWithDilation) +create_test_cudnn_fp16_class(Test_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DTransposeOp) +create_test_cudnn_bf16_class(TestWithSymmetricPad) +create_test_cudnn_bf16_class(TestWithAsymmetricPad) +create_test_cudnn_bf16_class(TestWithSAMEPad) +create_test_cudnn_bf16_class(TestWithVALIDPad) +create_test_cudnn_bf16_class(TestWithStride) +create_test_cudnn_bf16_class(TestWithGroups) +create_test_cudnn_bf16_class(TestWithDilation) +create_test_cudnn_bf16_class(Test_NHWC) + + +class TestConv3dTranspose(unittest.TestCase): + def error_weight_input(self): + array = np.array([1], dtype=np.float32) + x = paddle.to_tensor(np.reshape(array, [1, 1, 1, 1, 1]), dtype="float32") + weight = paddle.to_tensor(np.reshape(array, [1]), dtype="float32") + paddle.nn.functional.conv3d_transpose(x, weight, bias=0) + + def test_type_error(self): + self.assertRaises(ValueError, self.error_weight_input) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py new file mode 100644 index 00000000000..9bf91f5908f --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_conv3d_transpose_part2_op_metax.py @@ -0,0 +1,108 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
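+
+# NOTE: this "part2" file appears to extend conv3d_transpose coverage with
+# NHWC layouts. It reuses TestConv3DTransposeOp and the FP16/BF16 class
+# factories from legacy_test/test_conv3d_transpose_op (made importable via the
+# sys.path.append below), and likewise disables the cuDNN switch first.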
+ +import sys +import unittest + +sys.path.append("../../legacy_test") +from test_conv3d_transpose_op import ( + TestConv3DTransposeOp, + create_test_cudnn_bf16_class, + create_test_cudnn_fp16_class, +) + +from paddle.base import core + +core.set_cudnn_switch(False) + + +class TestWithSymmetricPad_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithAsymmetricPad_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 0, 1, 0, 1, 2] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithGroups_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_filter = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [1, 1, 1] + self.groups = 2 + self.input_size = [2, 5, 5, 5, 4] # NDHWC + f_c = self.input_size[-1] + self.filter_size = [f_c, 3, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithStride_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.pad = [1, 1, 1] + self.stride = [2, 2, 2] + self.dilations = [1, 1, 1] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NCDHW + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +class TestWithDilation_NHWC(TestConv3DTransposeOp): + def init_test_case(self): + self.check_no_input = True + self.pad = [1, 1, 1] + self.stride = [1, 1, 1] + self.dilations = [2, 2, 2] + self.groups = 1 + self.input_size = [2, 5, 5, 5, 3] # NCDHW + f_c = self.input_size[-1] + self.filter_size = [f_c, 6, 3, 3, 3] + self.data_format = "NHWC" + + +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithGroups_NHWC) +create_test_cudnn_fp16_class(TestWithStride_NHWC) +create_test_cudnn_fp16_class(TestWithDilation_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithGroups_NHWC) +create_test_cudnn_bf16_class(TestWithStride_NHWC) +create_test_cudnn_bf16_class(TestWithDilation_NHWC) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py b/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py new file mode 100644 index 00000000000..da5eeb34d0b --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deform_conv2d_metax.py @@ -0,0 +1,323 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from unittest import TestCase + +import numpy as np +from op_test import get_device_place, is_custom_device + +import paddle +import paddle.nn.initializer as I + +from paddle.base import core + +core.set_cublas_switch(False) + + +class TestDeformConv2D(TestCase): + batch_size = 4 + spatial_shape = (5, 5) + dtype = "float32" + + def setUp(self): + self.in_channels = 2 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [0, 0] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = True + + def prepare(self): + np.random.seed(1) + paddle.seed(1) + if isinstance(self.kernel_size, int): + filter_shape = (self.kernel_size,) * 2 + else: + filter_shape = tuple(self.kernel_size) + self.filter_shape = filter_shape + + self.weight = np.random.uniform( + -1, + 1, + (self.out_channels, self.in_channels // self.groups, *filter_shape), + ).astype(self.dtype) + if not self.no_bias: + self.bias = np.random.uniform(-1, 1, (self.out_channels,)).astype( + self.dtype + ) + + def out_size(in_size, pad_size, dilation_size, kernel_size, stride_size): + return ( + in_size + 2 * pad_size - (dilation_size * (kernel_size - 1) + 1) + ) / stride_size + 1 + + out_h = int( + out_size( + self.spatial_shape[0], + self.padding[0], + self.dilation[0], + self.kernel_size[0], + self.stride[0], + ) + ) + out_w = int( + out_size( + self.spatial_shape[1], + self.padding[1], + self.dilation[1], + self.kernel_size[1], + self.stride[1], + ) + ) + out_shape = (out_h, out_w) + + self.input_shape = ( + self.batch_size, + self.in_channels, + *self.spatial_shape, + ) + + self.offset_shape = ( + self.batch_size, + self.deformable_groups * 2 * filter_shape[0] * filter_shape[1], + *out_shape, + ) + + self.mask_shape = ( + self.batch_size, + self.deformable_groups * filter_shape[0] * filter_shape[1], + *out_shape, + ) + + self.input = np.random.uniform(-1, 1, self.input_shape).astype(self.dtype) + + self.offset = np.random.uniform(-1, 1, self.offset_shape).astype(self.dtype) + + self.mask = np.random.uniform(-1, 1, self.mask_shape).astype(self.dtype) + + def static_graph_case_dcn(self): + main = paddle.static.Program() + start = paddle.static.Program() + paddle.enable_static() + with paddle.static.program_guard(main, start): + x = paddle.static.data( + "input", (-1, self.in_channels, -1, -1), dtype=self.dtype + ) + offset = paddle.static.data( + "offset", + ( + -1, + self.deformable_groups + * 2 + * self.filter_shape[0] + * self.filter_shape[1], + -1, + -1, + ), + dtype=self.dtype, + ) + mask = paddle.static.data( + "mask", + ( + -1, + self.deformable_groups + * self.filter_shape[0] + * self.filter_shape[1], + -1, + -1, + ), + dtype=self.dtype, + ) + + y_v1 = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=self.deformable_groups, + weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + )(x, offset, None) + + y_v2 = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.filter_shape, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=self.deformable_groups, + 
weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + )(x, offset, mask) + + exe = paddle.static.Executor(self.place) + exe.run(start) + out_v1, out_v2 = exe.run( + main, + feed={ + "input": self.input, + "offset": self.offset, + "mask": self.mask, + }, + fetch_list=[y_v1, y_v2], + ) + return out_v1, out_v2 + + def dygraph_case_dcn(self): + paddle.disable_static() + x = paddle.to_tensor(self.input) + offset = paddle.to_tensor(self.offset) + mask = paddle.to_tensor(self.mask) + + bias = None if self.no_bias else paddle.to_tensor(self.bias) + + deform_conv2d = paddle.vision.ops.DeformConv2D( + in_channels=self.in_channels, + out_channels=self.out_channels, + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + deformable_groups=self.deformable_groups, + groups=self.groups, + weight_attr=I.Assign(self.weight), + bias_attr=False if self.no_bias else I.Assign(self.bias), + ) + + y_v1 = deform_conv2d(x, offset) + y_v2 = deform_conv2d(x, offset, mask) + + out_v1 = y_v1.numpy() + out_v2 = y_v2.numpy() + + return out_v1, out_v2 + + def _test_identity(self): + self.prepare() + static_dcn_v1, static_dcn_v2 = self.static_graph_case_dcn() + dy_dcn_v1, dy_dcn_v2 = self.dygraph_case_dcn() + np.testing.assert_array_almost_equal(static_dcn_v1, dy_dcn_v1) + np.testing.assert_array_almost_equal(static_dcn_v2, dy_dcn_v2) + + def test_identity(self): + self.place = paddle.CPUPlace() + self._test_identity() + + if paddle.is_compiled_with_cuda() or is_custom_device(): + self.place = get_device_place() + self._test_identity() + + +# testcases for DeformConv2D +class TestDeformConv2DWithPadding(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = True + + +class TestDeformConv2DWithBias(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [2, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithAsynPadding(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 2] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithDilation(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [3, 3] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithStride(TestDeformConv2D): + def setUp(self): + self.in_channels = 3 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [2, 2] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithDeformable_Groups(TestDeformConv2D): + def setUp(self): + self.in_channels = 5 + self.out_channels = 5 + self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 5 + self.groups = 1 + self.no_bias = False + + +class TestDeformConv2DWithGroups(TestDeformConv2D): + def setUp(self): + self.in_channels = 5 + self.out_channels = 5 + 
self.kernel_size = [3, 3] + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.deformable_groups = 1 + self.groups = 5 + self.no_bias = False + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py b/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py new file mode 100644 index 00000000000..1f26abb73f8 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deformable_conv_op_metax.py @@ -0,0 +1,504 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from itertools import product + +import numpy as np +from op_test import OpTest + +import paddle + +paddle.enable_static() + +from paddle.base import core + +core.set_cublas_switch(False) + + +def dmc_bilinear(data_im, height, width, h, w): + h_low = int(np.floor(h)) + w_low = int(np.floor(w)) + h_high = h_low + 1 + w_high = w_low + 1 + + lh = h - h_low + lw = w - w_low + hh = 1 - lh + hw = 1 - lw + + v1 = 0 + if h_low >= 0 and w_low >= 0: + v1 = data_im[h_low, w_low] + v2 = 0 + if h_low >= 0 and w_high <= width - 1: + v2 = data_im[h_low, w_high] + v3 = 0 + if h_high <= height - 1 and w_low >= 0: + v3 = data_im[h_high, w_low] + v4 = 0 + if h_high <= height - 1 and w_high <= width - 1: + v4 = data_im[h_high, w_high] + + w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 + + return val + + +def dconv_im2col_gemm(input, offset, mask, filter, group, conv_param): + in_n, in_c, in_h, in_w = input.shape + out_c, f_c, f_h, f_w = filter.shape + + assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w) + assert mask.shape == (in_n, f_h * f_w, in_h, in_w) + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + + stride, pad, dilation = ( + conv_param["stride"], + conv_param["pad"], + conv_param["dilation"], + ) + out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] + out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] + assert out_h == in_h + assert out_w == in_w + + col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w)) + for n, c, h, w, kh, kw in product( + range(in_n), + range(in_c), + range(out_h), + range(out_w), + range(f_h), + range(f_w), + ): + offset_h_table = offset[n, ::2, h, w].reshape(f_h, f_w) + offset_w_table = offset[n, 1::2, h, w].reshape(f_h, f_w) + mask_table = mask[n, :, h, w].reshape(f_h, f_w) + offset_h = offset_h_table[kh, kw] + offset_w = offset_w_table[kh, kw] + val = 0 + im_h = h * stride[0] + kh * dilation[0] + offset_h - pad[0] + im_w = w * stride[0] + kw * dilation[0] + offset_w - pad[1] + if im_h > -1 and im_w > -1 and im_h < in_h and im_w < in_h: + val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w) + val_out = val * mask_table[kh, kw] + col_buffer[n, c * f_h * f_w + kh * f_w + kw, h * in_w + w] = val_out + + out = np.zeros((in_n, group, int(out_c // group), out_h * out_w)) + weight = filter.reshape(group, 
int(out_c // group), f_c * f_h * f_w) + col_buffer = col_buffer.reshape( + (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w) + ) + for n in range(in_n): + for g in range(group): + out[n, g] = np.matmul(weight[g], col_buffer[n, g]) + out = out.reshape(in_n, out_c, out_h, out_w) + return out + + +def deform_conv2d_wrapper( + x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1, +): + return paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + None, + stride, + padding, + dilation, + deformable_groups, + groups, + mask, + ) + + +class TestModulatedDeformableConvOp(OpTest): + def setUp(self): + self.python_api = deform_conv2d_wrapper + self.op_type = "deformable_conv" + self.init_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + offset = 10 * np.random.random(self.offset_size).astype(self.dtype) + mask = 10 * np.random.random(self.mask_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + output = dconv_im2col_gemm(input, offset, mask, filter, self.groups, conv_param) + output = output.astype(self.dtype) + + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Offset": OpTest.np_dtype_to_base_dtype(offset), + "Mask": OpTest.np_dtype_to_base_dtype(mask), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + } + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + "deformable_groups": self.deformable_groups, + "im2col_step": self.im2col_step, + "dilations": self.dilations, + } + self.outputs = {"Output": output} + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad( + {"Input", "Offset", "Mask", "Filter"}, + "Output", + max_relative_error=0.05, + check_pir=True, + ) + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_type(self): + self.dtype = np.float32 + + +class TestWithStride(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [3, 3] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + 
self.input_size[2], + self.input_size[3], + ] + + +class TestWithDilation(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [1, 1] + self.input_size = [4, 3, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [2, 2] + + +class TestWith3x3(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithGroup(TestModulatedDeformableConvOp): + def init_group(self): + self.groups = 2 + + +class TestWithDouble(TestModulatedDeformableConvOp): + def init_type(self): + self.dtype = np.float64 + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 6, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestModulatedDeformableConvInvalidInput(unittest.TestCase): + def test_error_api(self): + def test_invalid_input(): + paddle.enable_static() + input = [1, 3, 32, 32] + offset = paddle.static.data( + name="offset", shape=[None, 3, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask", shape=[None, 3, 32, 32], dtype="float32" + ) + loss = paddle.vision.ops.DeformConv2D( + in_channels=input[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + self.assertRaises(TypeError, test_invalid_input) + + def test_invalid_offset(): + paddle.enable_static() + input = paddle.static.data( + name="input", shape=[None, 3, 32, 32], dtype="int32" + ) + offset = paddle.static.data( + name="offset", shape=[None, 3, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask", shape=[None, 3, 32, 32], dtype="float32" + ) + loss = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + self.assertRaises(TypeError, test_invalid_offset) + + def test_invalid_groups(): + 
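+            # NOTE: groups=0 is expected to fail with ZeroDivisionError
+            # (asserted below), presumably when in_channels is divided by
+            # groups to derive the per-group filter shape.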
paddle.enable_static() + input = paddle.static.data( + name="input_groups", shape=[1, 1, 1, 1], dtype="float32" + ) + offset = paddle.static.data( + name="offset_groups", shape=[1, 1], dtype="float32" + ) + mask = paddle.static.data(name="mask_groups", shape=[1], dtype="float32") + loss = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], + out_channels=1, + kernel_size=1, + padding=1, + groups=0, + )(input, offset, mask) + + self.assertRaises(ZeroDivisionError, test_invalid_groups) + + +class TestDeformConv2DAPI(unittest.TestCase): + def test_api(self): + def test_deform_conv2d_v1(): + paddle.enable_static() + input = paddle.static.data( + name="input_v1", shape=[None, 3, 32, 32], dtype="float32" + ) + offset = paddle.static.data( + name="offset_v1", shape=[None, 4, 32, 32], dtype="float32" + ) + out = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, None) + assert tuple(out.shape) == (-1, 4, 32, 32) + + test_deform_conv2d_v1() + + def test_deform_conv2d_v2(): + paddle.enable_static() + input = paddle.static.data( + name="input_v2", shape=[None, 3, 32, 32], dtype="float32" + ) + offset = paddle.static.data( + name="offset_v2", shape=[None, 4, 32, 32], dtype="float32" + ) + mask = paddle.static.data( + name="mask_v2", shape=[None, 2, 32, 32], dtype="float32" + ) + out = paddle.vision.ops.DeformConv2D( + in_channels=input.shape[1], out_channels=4, kernel_size=1 + )(input, offset, mask) + + assert tuple(out.shape) == (-1, 4, 32, 32) + + test_deform_conv2d_v2() + + +class TestModulatedDeformableConvOp_ZeroSize(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + # 0-size + self.input_size = [0, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + mask_c = self.deformable_groups * self.filter_size[2] * self.filter_size[3] + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + self.mask_size = [ + self.input_size[0], + mask_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestDeformConv2DAPI_CPU_FP16(unittest.TestCase): + def setUp(self): + self.padding = [1, 1] + self.stride = [1, 1] + self.dilation = [1, 1] + self.groups = 1 + self.data_format = "NCL" + + def test_cpu_fp16(self): + with paddle.base.dygraph.guard(paddle.CPUPlace()): + x = paddle.ones([4, 5, 5, 5]) + offset = paddle.ones([4, 90, 5, 5]).astype(paddle.float16) + weight = paddle.ones([5, 5, 3, 3]).astype(paddle.float16) + bias = paddle.ones([5]).astype(paddle.float16) + mask = paddle.ones([4, 45, 5, 5]).astype(paddle.float16) + + # If there is an error, an error will be thrown. 
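+            # NOTE: with a 3x3 kernel and deformable_groups=5, the offset
+            # needs 2 * 3 * 3 * 5 = 90 channels and the mask needs
+            # 3 * 3 * 5 = 45 channels, matching the tensors built above.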
+ out = paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + bias, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + groups=self.groups, + deformable_groups=5, + mask=mask, + ) + np.testing.assert_allclose(out.shape, [4, 5, 5, 5]) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py b/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py new file mode 100644 index 00000000000..6a4244db267 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_deformable_conv_v1_op_metax.py @@ -0,0 +1,319 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from itertools import product + +import numpy as np +from op_test import OpTest + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + + +def dmc_bilinear(data_im, height, width, h, w): + h_low = int(np.floor(h)) + w_low = int(np.floor(w)) + h_high = h_low + 1 + w_high = w_low + 1 + + lh = h - h_low + lw = w - w_low + hh = 1 - lh + hw = 1 - lw + + v1 = 0 + if h_low >= 0 and w_low >= 0: + v1 = data_im[h_low, w_low] + v2 = 0 + if h_low >= 0 and w_high <= width - 1: + v2 = data_im[h_low, w_high] + v3 = 0 + if h_high <= height - 1 and w_low >= 0: + v3 = data_im[h_high, w_low] + v4 = 0 + if h_high <= height - 1 and w_high <= width - 1: + v4 = data_im[h_high, w_high] + + w1, w2, w3, w4 = hh * hw, hh * lw, lh * hw, lh * lw + val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4 + + return val + + +def dconv_im2col_gemm(input, offset, filter, group, conv_param): + in_n, in_c, in_h, in_w = input.shape + out_c, f_c, f_h, f_w = filter.shape + + assert offset.shape == (in_n, 2 * f_h * f_w, in_h, in_w) + assert f_c * group == in_c + assert np.mod(out_c, group) == 0 + + stride, pad, dilation = ( + conv_param["stride"], + conv_param["pad"], + conv_param["dilation"], + ) + out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0] + out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1] + assert out_h == in_h + assert out_w == in_w + + col_buffer = np.zeros((in_n, in_c * f_h * f_w, in_h * in_w)) + for n, c, h, w, kh, kw in product( + range(in_n), + range(in_c), + range(out_h), + range(out_w), + range(f_h), + range(f_w), + ): + offset_h_table = offset[n, ::2, h, w].reshape(f_h, f_w) + offset_w_table = offset[n, 1::2, h, w].reshape(f_h, f_w) + offset_h = offset_h_table[kh, kw] + offset_w = offset_w_table[kh, kw] + val = 0 + im_h = h * stride[0] + kh * dilation[0] + offset_h - pad[0] + im_w = w * stride[0] + kw * dilation[0] + offset_w - pad[1] + if im_h > -1 and im_w > -1 and im_h < in_h and im_w < in_h: + val = dmc_bilinear(input[n, c], in_h, in_w, im_h, im_w) + val_out = val + + col_buffer[n, c * f_h * f_w + kh * f_w + kw, h * in_w + w] = val_out + + out = np.zeros((in_n, group, int(out_c // group), out_h * out_w)) + weight = filter.reshape(group, int(out_c // group), f_c * f_h * f_w) + col_buffer = 
col_buffer.reshape( + (in_n, group, int(in_c // group * f_h * f_w), in_h * in_w) + ) + for n in range(in_n): + for g in range(group): + out[n, g] = np.matmul(weight[g], col_buffer[n, g]) + out = out.reshape(in_n, out_c, out_h, out_w) + return out + + +def deform_conv2d_wrapper( + x, + offset, + weight, + mask=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + im2col_step=1, +): + return paddle.vision.ops.deform_conv2d( + x, + offset, + weight, + None, + stride, + padding, + dilation, + deformable_groups, + groups, + mask, + ) + + +class TestModulatedDeformableConvOp(OpTest): + def setUp(self): + self.python_api = deform_conv2d_wrapper + self.op_type = "deformable_conv_v1" + self.init_type() + self.init_group() + self.init_dilation() + self.init_test_case() + + conv_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + offset = 10 * np.random.random(self.offset_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + + output = dconv_im2col_gemm(input, offset, filter, self.groups, conv_param) + output = output.astype(self.dtype) + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Offset": OpTest.np_dtype_to_base_dtype(offset), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + } + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + "deformable_groups": self.deformable_groups, + "im2col_step": self.im2col_step, + "dilations": self.dilations, + } + self.outputs = {"Output": output} + + def test_check_output(self): + self.check_output(check_pir=True) + + def test_check_grad(self): + self.check_grad( + ["Input", "Offset", "Filter"], + "Output", + max_relative_error=0.05, + check_pir=True, + ) + + def test_check_grad_no_filter(self): + self.check_grad( + ["Input", "Offset"], + "Output", + max_relative_error=0.1, + no_grad_set={"Filter"}, + check_pir=True, + ) + + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 4, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_type(self): + self.dtype = np.float32 + + +class TestWithStride(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [3, 3] + self.stride = [2, 2] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithDilation(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [2, 2] + self.stride = [1, 1] + self.input_size = [5, 3, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.im2col_step = 
1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_dilation(self): + self.dilations = [2, 2] + + +class TestWith1x1(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [40, f_c, 1, 1] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + +class TestWithGroup(TestModulatedDeformableConvOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.dilations = [1, 1] + self.input_size = [2, 8, 4, 4] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [4, f_c, 3, 3] + self.im2col_step = 1 + self.deformable_groups = 1 + offset_c = ( + 2 * self.deformable_groups * self.filter_size[2] * self.filter_size[3] + ) + self.offset_size = [ + self.input_size[0], + offset_c, + self.input_size[2], + self.input_size[3], + ] + + def init_group(self): + self.groups = 2 + + +class TestWithDouble(TestModulatedDeformableConvOp): + def init_type(self): + self.dtype = np.float64 + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py b/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py new file mode 100644 index 00000000000..f3f3bb30e34 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_einsum_0d_tensor_metax.py @@ -0,0 +1,201 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
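+
+# NOTE: the cases below exercise paddle.einsum with 0-D (scalar) operands;
+# each one checks the forward value against numpy.einsum and, after
+# mean().backward(), the gradient shapes of every differentiable input.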
+ +import os +import unittest + +import numpy as np +from numpy.testing import assert_allclose + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + +os.environ["NVIDIA_TF32_OVERRIDE"] = "0" + + +class Test0DCase0(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + def tearDown(self): + paddle.enable_static() + + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + y = paddle.rand([]) + y.stop_gradient = False + z = paddle.einsum("...,...->...", x, y) + assert_allclose( + z.numpy(), + np.einsum("...,...->...", x.numpy(), y.numpy()), + atol=1e-6, + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert y.grad.shape == [] + + +class Test0DCase1(Test0DCase0): + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("...,ij->...", x, y) + assert_allclose( + z.numpy(), np.einsum("...,ij->...", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert y.grad.shape == [2, 2] + + +class Test0DCase2(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("ij,ij->", x, y) + assert_allclose( + z.numpy(), np.einsum("ij,ij->", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [2, 2] + + +class Test0DCase3(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = True + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("ij,ij->", x, y) + assert_allclose( + z.numpy(), np.einsum("ij,ij->", x.numpy(), y.numpy()), atol=1e-6 + ) + z.mean().backward() + assert z.shape == [] + assert x.grad is None + assert y.grad.shape == [2, 2] + + +class Test0DCase4(Test0DCase0): + def test_func(self): + x = paddle.rand([]) + x.stop_gradient = False + z = paddle.einsum("...->...", x) + assert_allclose(z.numpy(), np.einsum("...->...", x.numpy()), atol=1e-6) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [] + assert x.grad.numpy() == 1.0 + + +class Test0DCase5(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + y = paddle.rand([2, 2]) + y.stop_gradient = False + z = paddle.einsum("i...j, i...j->...", x, y) + assert_allclose( + z.numpy(), + np.einsum("i...j, i...j->...", x.numpy(), y.numpy()), + atol=1e-6, + ) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [2, 2] + + +class Test0DCase6(Test0DCase0): + def test_func(self): + x = paddle.rand([2, 2]) + x.stop_gradient = False + z = paddle.einsum("ij->", x) + assert_allclose(z.numpy(), np.einsum("ij->", x.numpy()), atol=1e-6) + z.mean().backward() + assert z.shape == [] + assert x.grad.shape == [2, 2] + + +class Test0DCase7(Test0DCase0): + def test_func(self): + """ + 3 operands. 
+ """ + x = paddle.rand([2, 2]) + y = paddle.rand([]) + z = paddle.rand([]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + o = paddle.einsum("ij...,...,...->...", x, y, z) + assert_allclose( + o.numpy(), + np.einsum("ij...,...,...->...", x.numpy(), y.numpy(), z.numpy()), + atol=1e-6, + ) + o.mean().backward() + assert o.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [] + assert z.grad.shape == [] + + +class Test0DCase8(Test0DCase0): + def test_func(self): + """ + 3 operands. + """ + x = paddle.rand([2, 2]) + y = paddle.rand([]) + z = paddle.rand([]) + e = paddle.rand([3, 1]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + e.stop_gradient = False + o = paddle.einsum("ij...,...,..., km->...", x, y, z, e) + assert_allclose( + o.numpy(), + np.einsum( + "ij...,...,...,km->...", + x.numpy(), + y.numpy(), + z.numpy(), + e.numpy(), + ), + atol=1e-6, + ) + o.mean().backward() + assert o.shape == [] + assert x.grad.shape == [2, 2] + assert y.grad.shape == [] + assert z.grad.shape == [] + assert e.grad.shape == [3, 1] + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py b/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py new file mode 100644 index 00000000000..67afd71c5f9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_fc_op_metax.py @@ -0,0 +1,138 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from op_test import OpTest + +from paddle.base import core + +core.set_cublas_switch(False) + +SEED = 2020 + + +def fc_refer(matrix, with_bias, with_relu=False): + in_n, in_c, in_h, in_w = matrix.input.shape + w_i, w_o = matrix.weights.shape + + x_data = np.reshape(matrix.input, [in_n, in_c * in_h * in_w]) + w_data = np.reshape(matrix.weights, [w_i, w_o]) + b_data = np.reshape(matrix.bias, [1, w_o]) + result = None + + if with_bias: + result = np.dot(x_data, w_data) + b_data + else: + result = np.dot(x_data, w_data) + + if with_relu: + return np.maximum(result, 0) + else: + return result + + +class MatrixGenerate: + def __init__(self, mb, ic, oc, h, w, bias_dims=2): + self.input = np.random.random((mb, ic, h, w)).astype("float32") + self.weights = np.random.random((ic * h * w, oc)).astype("float32") + if bias_dims == 2: + self.bias = np.random.random((1, oc)).astype("float32") + else: + self.bias = np.random.random(oc).astype("float32") + + +class TestFCOp(OpTest): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 10, 15, 3, 3, 2) + + def setUp(self): + self.op_type = "fc" + self.config() + + if self.with_bias: + self.inputs = { + "Input": self.matrix.input, + "W": self.matrix.weights, + "Bias": self.matrix.bias, + } + else: + self.inputs = {"Input": self.matrix.input, "W": self.matrix.weights} + + if self.with_relu: + activation_type = "relu" + else: + activation_type = "" + self.attrs = {"use_onednn": False, "activation_type": activation_type} + + self.outputs = {"Out": fc_refer(self.matrix, self.with_bias, self.with_relu)} + + def test_check_output(self): + self.check_output(check_dygraph=False) + + +class TestFCOpNoBias1(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(2, 8, 10, 1, 1, 2) + + +class TestFCOpNoBias2(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1) + + +class TestFCOpNoBias4(TestFCOp): + def config(self): + self.with_bias = False + self.with_relu = False + self.matrix = MatrixGenerate(1, 32, 64, 3, 3, 1) + + +class TestFCOpWithBias1(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = False + self.matrix = MatrixGenerate(3, 8, 10, 2, 1, 2) + + +class TestFCOpWithBias2(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(4, 5, 6, 2, 2, 1) + + +class TestFCOpWithBias3(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 64, 32, 3, 3, 1) + + +class TestFCOpWithPadding(TestFCOp): + def config(self): + self.with_bias = True + self.with_relu = True + self.matrix = MatrixGenerate(1, 4, 3, 128, 128, 2) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py b/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py new file mode 100644 index 00000000000..2b405a76367 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_fused_conv2d_add_act_op_metax.py @@ -0,0 +1,429 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from op_test import OpTest, get_device_place, is_custom_device +from test_conv2d_op import conv2d_forward_naive + +from paddle.base import core + +core.set_cudnn_switch(False) + + +def create_test_padding_SAME_class(parent): + class TestPaddingSAMECase(parent): + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "SAME" + + cls_name = "{}_{}".format(parent.__name__, "PaddingSAMEOp") + TestPaddingSAMECase.__name__ = cls_name + globals()[cls_name] = TestPaddingSAMECase + + +def create_test_padding_VALID_class(parent): + class TestPaddingVALIDCase(parent): + def init_paddings(self): + self.pad = [1, 1] + self.padding_algorithm = "VALID" + + cls_name = "{}_{}".format(parent.__name__, "PaddingVALIDOp") + TestPaddingVALIDCase.__name__ = cls_name + globals()[cls_name] = TestPaddingVALIDCase + + +def create_test_cudnn_channel_last_class(parent): + @unittest.skipIf( + not (core.is_compiled_with_cuda() or is_custom_device()), + "core is not compiled with CUDA", + ) + class TestCudnnChannelLastCase(parent): + def init_test_case(self): + super().init_test_case() + self.data_format = "NHWC" + N, C, H, W = self.input_size + self.input_size = [N, H, W, C] + K1, K2, R, S = self.filter_size + self.filter_size = [K1, R, S, K2] + + def test_check_output(self): + print(self.attrs) + if self.has_cuda(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-5, check_dygraph=False) + + cls_name = "{}_{}".format(parent.__name__, "CudnnChannelLast") + TestCudnnChannelLastCase.__name__ = cls_name + globals()[cls_name] = TestCudnnChannelLastCase + + +class TestFusedConv2dAddActOp(OpTest): + def setUp(self): + self.op_type = "fused_conv2d_add_act" + self.exhaustive_search = False + self.data_format = "NCHW" + self.dtype = np.float32 + self.activation = "relu" + self.add_residual_data = True + self.split_channels = None + self.outputs = None + self.padding_algorithm = "EXIPLICIT" + + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_residual() + self.init_activation() + self.init_paddings() + self.set_search_method() + + conv2d_param = { + "stride": self.stride, + "pad": self.pad, + "dilation": self.dilations, + } + + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + bias = np.random.random(self.filter_size[0]).astype(self.dtype) + + if self.data_format == "NHWC": + filter_nchw = np.transpose(filter, [0, 3, 1, 2]) + else: + filter_nchw = filter + + self.output, _, _, _, _ = conv2d_forward_naive( + input, + filter_nchw, + self.groups, + conv2d_param, + self.padding_algorithm, + self.data_format, + ) + + self.output = self.output.astype(self.dtype) + + self.inputs = { + "Input": OpTest.np_dtype_to_base_dtype(input), + "Filter": OpTest.np_dtype_to_base_dtype(filter), + "Bias": OpTest.np_dtype_to_base_dtype(bias), + } + + if self.add_residual_data: + residual_data = np.random.random(self.output.shape).astype(self.dtype) + self.inputs["ResidualData"] = OpTest.np_dtype_to_base_dtype(residual_data) + self.output += residual_data + 
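+
+            # NOTE: the reference output is assembled as naive conv2d ->
+            # optional residual add -> bias add -> activation, i.e. the chain
+            # that fused_conv2d_add_act is expected to compute in one op.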
+ # Add bias + if self.data_format == "NCHW": + self.output = self.output + bias.reshape((1, bias.size, 1, 1)) + else: + self.output = self.output + bias.reshape((1, 1, 1, bias.size)) + + assert self.activation in ["relu", "identity"] + if self.activation == "relu": + self.output = np.maximum(self.output, 0) + + self.attrs = { + "strides": self.stride, + "paddings": self.pad, + "groups": self.groups, + "dilations": self.dilations, + "data_format": self.data_format, + "exhaustive_search": self.exhaustive_search, + "activation": self.activation, + "padding_algorithm": self.padding_algorithm, + } + if self.split_channels is not None: + self.attrs["split_channels"] = self.split_channels + + self.outputs = {"Output": self.output} + + self.set_outputs() + + def has_cuda(self): + return core.is_compiled_with_cuda() or is_custom_device() + + def test_check_output(self): + if self.has_cuda(): + place = get_device_place() + self.check_output_with_place(place, atol=1e-5, check_dygraph=False) + + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [1, 1] + + def init_group(self): + self.groups = 1 + + def init_residual(self): + self.add_residual_data = True + + def init_activation(self): + self.activation = "relu" + + def set_search_method(self): + self.exhaustive_search = False + + def set_outputs(self): + pass + + def init_paddings(self): + self.pad = [0, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithoutResidual(TestFusedConv2dAddActOp): + def init_residual(self): + self.add_residual_data = False + + +class TestIdentityActivation(TestFusedConv2dAddActOp): + def init_activation(self): + self.activation = "identity" + + +class TestIdentityActivation1(TestFusedConv2dAddActOp): + def init_activation(self): + self.activation = "identity" + self.add_residual_data = False + + +class TestWithGroup(TestFusedConv2dAddActOp): + def init_group(self): + self.groups = 3 + + +class TestWithDilation(TestFusedConv2dAddActOp): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + +class TestCUDNNExhaustiveSearch(TestFusedConv2dAddActOp): + def set_search_method(self): + self.exhaustive_search = True + + +class TestMultipleOutputs(TestFusedConv2dAddActOp): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [1, 1] + self.input_size = [1, 32, 17, 17] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [126, f_c, 3, 3] + self.split_channels = [84, 42] + + def set_outputs(self): + out1 = self.output[:, 0:84, :, :] + out2 = self.output[:, 84:126, :, :] + self.outputs["Outputs"] = [("out1", out1), ("out2", out2)] + + +class TestAsyPadding(TestFusedConv2dAddActOp): + def init_paddings(self): + self.pad = [0, 0, 1, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithPad_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + 
self.filter_size = [6, f_c, 3, 3] + + def init_paddings(self): + self.pad = [2, 1, 3, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWithStride_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_paddings(self): + self.pad = [2, 1, 3, 2] + self.padding_algorithm = "EXPLICIT" + + +class TestWith1x1_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [2, 2, 4, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithGroup_AsyPadding(TestFusedConv2dAddActOp): + def init_group(self): + self.groups = 3 + + +class TestWithDepthWise3x3_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [3, 4, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [8, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 4 + + def init_paddings(self): + self.pad = [1, 3, 2, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDepthWise5x5_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 4, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [8, f_c, 5, 5] + + def init_group(self): + self.groups = 4 + + def init_paddings(self): + self.pad = [0, 1, 1, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDepthWise7x7_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [2, 2] + self.input_size = [2, 8, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [16, f_c, 7, 7] + + def init_group(self): + self.groups = 8 + + def init_paddings(self): + self.pad = [1, 3, 4, 1] + self.padding_algorithm = "EXPLICIT" + + +class TestWithDilation_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 10, 10] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + + def init_dilation(self): + self.dilations = [2, 2] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 1, 3, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestWithInput1x1Filter1x1_AsyPadding(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + + def init_group(self): + self.groups = 3 + + def init_paddings(self): + self.pad = [0, 3, 4, 0] + self.padding_algorithm = "EXPLICIT" + + +class TestSimpleNHWC(TestFusedConv2dAddActOp): + def init_test_case(self): + self.stride = [1, 1] + self.input_size = [3, 5, 5, 2] # NHWC + self.data_format = "NHWC" + assert np.mod(self.input_size[3], self.groups) == 0 + f_c = self.input_size[3] // 
self.groups + self.filter_size = [4, 3, 3, f_c] + + def init_group(self): + self.groups = 1 + + def init_paddings(self): + self.pad = [1, 1] + self.padding_algorithm = "EXPLICIT" + + +create_test_padding_SAME_class(TestAsyPadding) +create_test_padding_SAME_class(TestWithPad_AsyPadding) +create_test_padding_SAME_class(TestWithStride_AsyPadding) +create_test_padding_SAME_class(TestWithGroup_AsyPadding) +create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding) + +create_test_padding_VALID_class(TestAsyPadding) +create_test_padding_VALID_class(TestWithPad_AsyPadding) +create_test_padding_VALID_class(TestWithStride_AsyPadding) +create_test_padding_VALID_class(TestWithGroup_AsyPadding) +create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding) + +create_test_cudnn_channel_last_class(TestAsyPadding) +create_test_cudnn_channel_last_class(TestWithPad_AsyPadding) +create_test_cudnn_channel_last_class(TestWithStride_AsyPadding) +create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding) +create_test_cudnn_channel_last_class(TestWithInput1x1Filter1x1_AsyPadding) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py b/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py new file mode 100644 index 00000000000..803b00cc6b4 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_imperative_double_grad_metax.py @@ -0,0 +1,1106 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
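+
+# NOTE: this file covers second-order (double) gradients in dygraph mode; the
+# set_cudnn_switch(False) / set_cublas_switch(False) calls below presumably
+# pin the metax backend to non-cuDNN/cuBLAS code paths for these checks.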
+import unittest +from unittest import TestCase + +import numpy as np +from op_test import get_device, is_custom_device + +import paddle +import paddle.nn.functional as F +from paddle import base +from paddle.base.wrapped_decorator import wrap_decorator +from paddle.vision.models import resnet50, resnet101 + +from paddle.base import core + +core.set_cudnn_switch(False) + +core.set_cublas_switch(False) + + +def _dygraph_guard_(func): + def __impl__(*args, **kwargs): + if base.in_dygraph_mode(): + return func(*args, **kwargs) + else: + with base.dygraph.guard(): + return func(*args, **kwargs) + + return __impl__ + + +dygraph_guard = wrap_decorator(_dygraph_guard_) + + +def random_var(size, low=-1, high=1, dtype="float32"): + x_np = np.random.uniform(low=low, high=high, size=size).astype(dtype) + return paddle.to_tensor(x_np) + + +class TestEagerGrad(TestCase): + def test_simple_example_eager_grad(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + out = paddle.matmul(x, y) + dx = base.dygraph.grad(out, x) + + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + np.testing.assert_allclose(dx[0].numpy(), expected_dx, rtol=1e-05) + + def test_simple_example_eager_grad_allow_unused(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + dx = base.dygraph.grad(out, [x, z], allow_unused=True) + dout = np.ones_like(np_y) + expected_dx = np.matmul(dout, np.transpose(np_y)) + np.testing.assert_allclose(dx[0].numpy(), expected_dx, rtol=1e-05) + # stop_gradient = !create_graph, create_graph default false + self.assertEqual(dx[0].stop_gradient, True) + # x is unused input in the graph + self.assertIsNone(dx[1]) + + def test_simple_example_eager_grad_not_allow_unused(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # allow_unused is false in default + dx = base.dygraph.grad(out, [x, z]) + except ValueError as e: + error_msg = str(e) + assert error_msg.find("allow_unused") > 0 + + def test_simple_example_eager_grad_duplicate_input(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # duplicate input will arise RuntimeError errors 
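+            # NOTE: i.e. passing the same tensor twice in `inputs` should
+            # raise a RuntimeError, which the except branch below asserts.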
+ dx = base.dygraph.grad(out, [x, x]) + except RuntimeError as e: + error_msg = str(e) + assert error_msg.find("duplicate") > 0 + + def test_simple_example_eager_grad_duplicate_output(self): + np.random.seed(2021) + paddle.set_device("cpu") + np_x = np.random.random((3, 3)) + np_y = np.random.random((3, 1)) + np_z = np.random.random((3, 1)) + x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False) + y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False) + z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False) + out_z = paddle.nn.functional.sigmoid(z) + out = paddle.matmul(x, y) + + try: + # duplicate output will arise RuntimeError errors + dx = base.dygraph.grad([out, out], [x]) + except RuntimeError as e: + error_msg = str(e) + assert error_msg.find("duplicate") > 0 + + def test_simple_example_eager_two_grad_output(self): + x1 = paddle.to_tensor([1.0, 2.0]) + x1.stop_gradient = False + x2 = paddle.to_tensor([1.0, 2.0]) + x2.stop_gradient = False + out1 = x1 * 2 + out2 = x2 * 2 + + dout2_record_by_hook = [] + + def record_hook(grad): + dout2_record_by_hook.append(grad) + + out2.register_hook(record_hook) + + out3 = paddle.multiply(out1, out2) + out4 = paddle.mean(out3) + egr_dout2, egr_dout3 = paddle.grad([out4], [out2, out3]) + + np.testing.assert_array_equal( + dout2_record_by_hook[0].numpy(), np.array([1.0, 2.0]) + ) + + x1 = paddle.to_tensor([1.0, 2.0]) + x1.stop_gradient = False + x2 = paddle.to_tensor([1.0, 2.0]) + x2.stop_gradient = False + out1 = x1 * 2 + out2 = x2 * 2 + + out3 = paddle.multiply(out1, out2) + out4 = paddle.mean(out3) + dout2, dout3 = paddle.grad([out4], [out2, out3]) + + self.assertEqual(dout2.stop_gradient, egr_dout2.stop_gradient) + self.assertEqual(dout3.stop_gradient, egr_dout3.stop_gradient) + np.testing.assert_array_equal(dout2.numpy(), egr_dout2.numpy()) + np.testing.assert_array_equal(dout3.numpy(), egr_dout3.numpy()) + + +class TestDygraphDoubleGrad(TestCase): + def setUp(self): + self.sort_sum_gradient = False + self.shape = [5, 10] + + def grad( + self, + outputs, + inputs, + grad_outputs=None, + no_grad_vars=None, + retain_graph=None, + create_graph=False, + allow_unused=False, + ): + base.set_flags({"FLAGS_sort_sum_gradient": self.sort_sum_gradient}) + return base.dygraph.grad( + outputs=outputs, + inputs=inputs, + grad_outputs=grad_outputs, + no_grad_vars=no_grad_vars, + retain_graph=retain_graph, + create_graph=create_graph, + allow_unused=allow_unused, + ) + + @dygraph_guard + def test_exception(self): + with self.assertRaises(AssertionError): + self.grad(None, None) + + shape = self.shape + + with self.assertRaises(AssertionError): + self.grad(1, random_var(shape)) + + with self.assertRaises(AssertionError): + self.grad(random_var(shape), 1) + + with self.assertRaises(AssertionError): + self.grad([1], [random_var(shape)]) + + with self.assertRaises(AssertionError): + self.grad([random_var(shape)], [1]) + + with self.assertRaises(AssertionError): + self.grad( + [random_var(shape), random_var(shape)], + [random_var(shape)], + [random_var(shape)], + ) + + with self.assertRaises(AssertionError): + self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=[1]) + + with self.assertRaises(AssertionError): + self.grad([random_var(shape)], [random_var(shape)], no_grad_vars=1) + + @dygraph_guard + def test_simple_example(self): + x = random_var(self.shape) + x.stop_gradient = False + y = x + 1 + + for create_graph in [False, True]: + (dx,) = self.grad([x], [x], create_graph=create_graph, retain_graph=True) + 
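+                # NOTE: d(x)/d(x) is an all-ones tensor of x's shape; the
+                # grad([y, x], [x]) call further down sums the contributions
+                # from y = x + 1 and from x itself, hence the expected 2.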
self.assertEqual(dx.shape, x.shape) + self.assertTrue(np.all(dx.numpy() == 1)) + self.assertNotEqual(dx.stop_gradient, create_graph) + + (dx_mul_2,) = self.grad( + [y, x], [x], create_graph=create_graph, retain_graph=True + ) + self.assertEqual(dx_mul_2.shape, x.shape) + self.assertTrue(np.all(dx_mul_2.numpy() == 2)) + self.assertNotEqual(dx_mul_2.stop_gradient, create_graph) + + (none_grad,) = self.grad( + [x], [y], create_graph=create_graph, allow_unused=True + ) + self.assertIsNone(none_grad) + + (grad_with_none_and_not_none,) = self.grad( + [x, y], [y], create_graph=create_graph + ) + self.assertTrue(grad_with_none_and_not_none.shape, x.shape) + self.assertTrue(np.all(grad_with_none_and_not_none.numpy() == 1)) + self.assertNotEqual(grad_with_none_and_not_none.stop_gradient, create_graph) + + @dygraph_guard + def test_example_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = F.relu(x) + y2 = F.relu(x) + z = y1 + y2 + w = z * z + + w_mean = paddle.mean(w) + del y1, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=True, no_grad_vars=[y2]) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * (x_np > 0) * 2 + ).astype("float32") + + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + @dygraph_guard + def test_none_one_initial_gradient(self): + numel = 1 + for s in self.shape: + numel *= s + + half_numel = int(numel / 2) + half_x_positive = np.random.uniform(low=1, high=2, size=[half_numel]) + half_x_negative = np.random.uniform(low=-2, high=-1, size=[numel - half_numel]) + x_np = np.array(list(half_x_positive) + list(half_x_negative)).astype("float32") + np.random.shuffle(x_np) + + x = paddle.to_tensor(x_np) + x.stop_gradient = False + + alpha = 0.2 + y = paddle.nn.functional.leaky_relu(x, alpha) + y = y * y + z = y * y + + x_np = x.numpy() + relu_x_np = np.maximum(x_np, alpha * x_np).astype("float32") + relu_x_grad_np = ((x_np > 0) + (x_np < 0) * alpha).astype("float32") + dy_expected = (relu_x_np * relu_x_grad_np * 2).astype("float32") + dz_expected = (np.power(relu_x_np, 3) * relu_x_grad_np * 4).astype("float32") + + random_grad_y = random_var(y.shape, low=1, high=2) + random_grad_z = random_var(z.shape, low=1, high=2) + ones_grad_y = np.ones(y.shape).astype("float32") + ones_grad_z = np.ones(z.shape).astype("float32") + + original_random_grad_y = random_grad_y.numpy() + original_random_grad_z = random_grad_z.numpy() + + for grad_y in [random_grad_y]: + for grad_z in [random_grad_z]: + for create_graph in [False, True]: + (dx_actual,) = self.grad( + outputs=[y, z], + inputs=[x], + grad_outputs=[grad_y, grad_z], + create_graph=create_graph, + retain_graph=True, + ) + + grad_y_np = ones_grad_y if grad_y is None else grad_y.numpy() + grad_z_np = ones_grad_z if grad_z is None else grad_z.numpy() + + dx_expected = dy_expected * grad_y_np + dz_expected * grad_z_np + np.testing.assert_allclose( + dx_actual.numpy(), dx_expected, rtol=1e-05 + ) + + if grad_y is not None: + self.assertTrue(grad_y.stop_gradient) + np.testing.assert_array_equal( + grad_y.numpy(), original_random_grad_y + ) + + if grad_z is not None: + self.assertTrue(grad_z.stop_gradient) + np.testing.assert_array_equal( + grad_z.numpy(), original_random_grad_z + ) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_create_graph(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = 
x_np.size + x.stop_gradient = False + + y = F.relu(x) + z = y + 1 + w = z * z + + w_mean = paddle.mean(w) + del y, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=True) + del w_mean + + self.assertFalse(dx_actual.stop_gradient) + + # Theoretical result based on math calculation + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + 1) * (x_np > 0) * 2 + ).astype("float32") + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward(retain_graph=True) + + x_grad_actual = x.gradient() + x_grad_expected = ( + 2.0 / float(numel) * (x_np + dx_expected * (x_np > 0) * 2 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + for i in range(5): + loss.backward(retain_graph=True) + x_grad_actual = x.gradient() + x_grad_expected = (i + 2) * ( + 2.0 + / float(numel) + * (x_np + dx_expected * (x_np > 0) * 2 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_no_grad_vars(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y1 = F.relu(x) + y2 = F.relu(x) + z = y1 + y2 + w = z * z + + w_mean = paddle.mean(w) + del y1, z, w + + (dx_actual,) = self.grad( + [w_mean], + [x], + retain_graph=True, + create_graph=True, + no_grad_vars=[y2], + ) + + self.assertFalse(y2.stop_gradient) + self.assertFalse(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + y2.numpy()) * (x_np > 0) * 2 + ).astype("float32") + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = ( + 2.0 / float(numel) * (x_np + dx_expected * (x_np > 0) * 4 / float(numel)) + ).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + @dygraph_guard + def test_example_with_gradient_accumulation_and_not_create_graph(self): + x = random_var(self.shape) + x_np = x.numpy() + numel = x_np.size + x.stop_gradient = False + + y = F.relu(x) + z = y + 1 + w = z * z + + w_mean = paddle.mean(w) + del y, z, w + + (dx_actual,) = self.grad([w_mean], [x], create_graph=False) + del w_mean + + self.assertTrue(dx_actual.stop_gradient) + + dx_expected = ( + 1.0 / float(numel) * (np.maximum(x_np, 0) + 1) * (x_np > 0) * 2 + ).astype("float32") + + np.testing.assert_allclose(dx_actual.numpy(), dx_expected, rtol=1e-05) + + loss = paddle.mean(dx_actual * dx_actual + x * x) + loss.backward() + + x_grad_actual = x.gradient() + x_grad_expected = (2.0 * x_np / float(numel)).astype("float32") + np.testing.assert_allclose(x_grad_actual, x_grad_expected, rtol=1e-05) + + +class TestDygraphDoubleGradSortGradient(TestDygraphDoubleGrad): + def setUp(self): + self.sort_sum_gradient = True + self.shape = [5, 10] + + +class TestDygraphDoubleGradVisitedUniq(TestCase): + def test_compare(self): + value = np.random.uniform(-0.5, 0.5, 100).reshape(10, 2, 5).astype("float32") + + def model_f(input): + linear = paddle.nn.Linear(5, 3) + for i in range(10): + if i == 0: + out = linear(input) + else: + out = out + linear(input) + return out + + base.set_flags({"FLAGS_sort_sum_gradient": True}) + + with base.dygraph.guard(): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use 
self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + a = paddle.to_tensor(value) + a.stop_gradient = False + + out = model_f(a) + + dx = base.dygraph.grad( + outputs=[out], + inputs=[a], + create_graph=False, + only_inputs=True, + allow_unused=False, + ) + + grad_1 = dx[0].numpy() + + with base.dygraph.guard(): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + a = paddle.to_tensor(value) + a.stop_gradient = False + + out = model_f(a) + out.backward() + + grad_2 = a.gradient() + + np.testing.assert_array_equal(grad_1, grad_2) + + +class TestDoubleGradResNet(TestCase): + def setUp(self): + paddle.seed(123) + if paddle.framework.use_pir_api(): + with paddle.pir_utils.OldIrGuard(): + # Note: dygraph use self.main_program.global_block().create_parameter(), it's need manual seed to old Program + paddle.framework.random._manual_program_seed(123) + paddle.framework.random._manual_program_seed(123) + else: + paddle.framework.random._manual_program_seed(123) + self.data = np.random.rand(1, 3, 224, 224).astype(np.float32) + + @dygraph_guard + def test_resnet_resnet50(self): + model = resnet50(pretrained=False) + egr_data = paddle.to_tensor(self.data) + egr_data.stop_gradient = False + egr_out = model(egr_data) + egr_preds = paddle.argmax(egr_out, axis=1) + egr_label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(egr_preds), num_classes=egr_out.shape[1] + ) + egr_target = paddle.sum(egr_out * egr_label_onehot, axis=1) + + egr_g = paddle.grad(outputs=egr_target, inputs=egr_out)[0] + egr_g_numpy = egr_g.numpy() + self.assertEqual(list(egr_g_numpy.shape), list(egr_out.shape)) + + model = resnet50(pretrained=False) + data = paddle.to_tensor(self.data) + data.stop_gradient = False + out = model(data) + preds = paddle.argmax(out, axis=1) + label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(preds), num_classes=out.shape[1] + ) + target = paddle.sum(out * label_onehot, axis=1) + + g = paddle.grad(outputs=target, inputs=out)[0] + g_numpy = g.numpy() + self.assertEqual(list(g_numpy.shape), list(out.shape)) + + np.testing.assert_array_equal(egr_out, out) + np.testing.assert_array_equal(egr_g_numpy, g_numpy) + + @dygraph_guard + def test_resnet_resnet101(self): + model = resnet101(pretrained=False) + egr_data = paddle.to_tensor(self.data) + egr_data.stop_gradient = False + egr_out = model(egr_data) + egr_preds = paddle.argmax(egr_out, axis=1) + egr_label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(egr_preds), num_classes=egr_out.shape[1] + ) + egr_target = paddle.sum(egr_out * egr_label_onehot, axis=1) + + egr_g = paddle.grad(outputs=egr_target, inputs=egr_out)[0] + egr_g_numpy = egr_g.numpy() + self.assertEqual(list(egr_g_numpy.shape), list(egr_out.shape)) + + model = resnet101(pretrained=False) + data = paddle.to_tensor(self.data) + data.stop_gradient = False + out = model(data) + preds = paddle.argmax(out, axis=1) + label_onehot = paddle.nn.functional.one_hot( + paddle.to_tensor(preds), num_classes=out.shape[1] + ) + target = paddle.sum(out * label_onehot, 
axis=1) + + g = paddle.grad(outputs=target, inputs=out)[0] + g_numpy = g.numpy() + self.assertEqual(list(g_numpy.shape), list(out.shape)) + + np.testing.assert_array_equal(egr_out, out) + np.testing.assert_array_equal(egr_g_numpy, g_numpy) + + +class TestDoubleGradBasics(TestCase): + def test_matmul(self): + input_numpy = np.ones([3, 3]) * 2 + x = paddle.to_tensor(input_numpy, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy, stop_gradient=False, dtype="float32") + grad_out = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + + out = paddle.matmul(x, y, False, False) + new_x_g, new_y_g = paddle.grad( + [out], [x, y], [grad_out], retain_graph=True, create_graph=True + ) + new_x_g.backward() + + out_ref = np.ones([3, 3]) * 12.0 + np.testing.assert_array_equal(out.numpy(), out_ref) + + new_x_g_ref = np.ones([3, 3]) * 6.0 + new_y_g_ref = np.ones([3, 3]) * 6.0 + np.testing.assert_array_equal(new_x_g.numpy(), new_x_g_ref) + np.testing.assert_array_equal(new_y_g.numpy(), new_y_g_ref) + + x_grad_ref = np.ones([3, 3]) * 0.0 + np.testing.assert_array_equal(x.grad.numpy(), x_grad_ref) + + y_grad_ref = np.ones([3, 3]) * 3.0 + np.testing.assert_array_equal(y.grad.numpy(), y_grad_ref) + + grad_out_grad_ref = np.ones([3, 3]) * 6.0 + np.testing.assert_array_equal(grad_out.grad.numpy(), grad_out_grad_ref) + + +class TestDygraphDoubleGradMatmul(TestCase): + # case1: ddy is none, no broadcast,dims != 1 + def test_matmul_double_grad_case1(self): + input_numpy_x = np.random.random([3, 3]).astype("float32") + input_numpy_y = np.random.random([3, 3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + (dx, dy) = paddle.grad( + [out], [x, y], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + ddy = ddx + dx_double_grad, dy_double_grad, ddout = paddle.grad( + [dx, dy], + [x, y, dout], + [ddx, ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, dy_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + dy_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + ddout_expected1 = np.matmul(np.ones([3, 3], dtype="float32"), input_numpy_y) + ddout_expected2 = np.matmul(input_numpy_x, np.ones([3, 3], dtype="float32")) + ddout_expected = ddout_expected1 + ddout_expected2 + return ( + dx_double_grad_expected, + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case2: ddx is none,no broadcast, dims != 1 + def test_matmul_double_grad_case2(self): + input_numpy_x = np.random.random([3, 3]).astype("float32") + input_numpy_y = np.random.random([3, 3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = 
paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([3, 3]), stop_gradient=False, dtype="float32" + ) + # when x is not differentiated in the first grad, dy in the second grad could be None in the composite op + dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.matmul( + np.ones([3, 3], dtype="float32"), + np.ones([3, 3], dtype="float32"), + ) + ddout_expected = np.matmul(input_numpy_x, np.ones([3, 3], dtype="float32")) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case3: ddx is none, dims = 1 + def test_matmul_double_grad_case3(self): + input_numpy_x = np.random.random([3]).astype("float32") + input_numpy_y = np.random.random([3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32") + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor(np.ones([3]), stop_gradient=False, dtype="float32") + # when x is not differentiated in the first grad, dy from the second grad could be None in the composite API. + dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([3], dtype="float32") + ddout_expected = np.matmul(input_numpy_x, np.ones([3], dtype="float32")) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case4: ddy is none, dims = 1 + def test_matmul_double_grad_case4(self): + input_numpy_x = np.random.random([3]).astype("float32") + input_numpy_y = np.random.random([3]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32") + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor(np.ones([3]), stop_gradient=False, dtype="float32") + # when y is not differentiated in the first grad, dx from the second grad could be None in the composite API.
+ dy_double_grad, ddout = paddle.grad( + [dx], + [y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dy_double_grad, ddout + + def expected(): + dy_double_grad_expected = np.ones([3], dtype="float32") + ddout_expected = np.matmul(input_numpy_y, np.ones([3], dtype="float32")) + return ( + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case5: ddx is none, broadcast, dims != 1 + def test_matmul_double_grad_case5(self): + input_numpy_x = np.random.random([2, 1]).astype("float32") + input_numpy_y = np.random.random([1]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([2]), stop_gradient=False, dtype="float32") + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor(np.ones([1]), stop_gradient=False, dtype="float32") + dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([2, 1], dtype="float32") + ddout_expected = np.matmul(input_numpy_x, np.ones([1], dtype="float32")) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + # case6: ddy is none, broadcast, dims != 1 + def test_matmul_double_grad_case6(self): + input_numpy_x = np.random.random([2, 1]).astype("float32") + input_numpy_y = np.random.random([1]).astype("float32") + + def actual(): + x = paddle.to_tensor(input_numpy_x, stop_gradient=False, dtype="float32") + y = paddle.to_tensor(input_numpy_y, stop_gradient=False, dtype="float32") + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor(np.ones([2]), stop_gradient=False, dtype="float32") + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([2, 1]), stop_gradient=False, dtype="float32" + ) + dy_double_grad, ddout = paddle.grad( + [dx], + [y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dy_double_grad, ddout + + def expected(): + dy_double_grad_expected = np.ones([1], dtype="float32") * 2 + ddout_expected = np.ones([2], dtype="float32") * input_numpy_y[0] + return ( + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if paddle.is_compiled_with_cuda() or is_custom_device(): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip(expected_results, actual_results): + np.testing.assert_allclose(expected_result, actual_result, rtol=1e-6) + + 
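+    # Informal note: the expected() helpers in the cases above all encode the
+    # same closed-form double-grad rules for out = matmul(x, y) with
+    # transpose_x = transpose_y = False:
+    #   first grad :  dx = dout @ y.T,            dy = x.T @ dout
+    #   second grad:  x_double_grad = dout @ ddy.T
+    #                 y_double_grad = ddx.T @ dout
+    #                 ddout = ddx @ y + x @ ddy
+    # Terms whose ddx/ddy (or first-order grad) is absent simply drop out; the
+    # 1-D and broadcast cases are the same formulas after matmul's implicit
+    # reshaping and reduction of the operands.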
# TODO(Ruting) test complex dtype when composite api support + """ + # case7: ddx is none, dims = 1, complex dtype + def test_matmul_double_grad_case7(self): + input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y_conj = np.conjugate(input_numpy_y) + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='complex64' + ) + (dx,) = paddle.grad( + [out], [x], [dout], retain_graph=True, create_graph=True + ) + ddx = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='complex64' + ) + # when y is not be differentiate in first grad, dx from second grad could be None in composite api. + dy_double_grad, ddout = paddle.grad( + [dx], + [y, dout], + [ddx], + retain_graph=True, + create_graph=True, + ) + return dy_double_grad, ddout + + def expected(): + dy_double_grad_expected = np.ones( + [3], dtype="float32" + ) + 0j * np.ones([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_y_conj, np.ones([3], dtype="float32") + ) + return ( + dy_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if (paddle.is_compiled_with_cuda() or is_custom_device()): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + + + # case8: ddy is none, dims = 1, complex dtype + def test_matmul_double_grad_case8(self): + input_numpy_x = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_y = np.random.random([3]).astype( + 'float32' + ) + 1j * np.random.random([3]).astype('float32') + input_numpy_x_conj = np.conjugate(input_numpy_x) + + def actual(): + x = paddle.to_tensor( + input_numpy_x, stop_gradient=False, dtype='complex64' + ) + y = paddle.to_tensor( + input_numpy_y, stop_gradient=False, dtype='complex64' + ) + out = paddle.matmul(x, y, False, False) + + dout = paddle.to_tensor( + np.ones([1]), stop_gradient=False, dtype='complex64' + ) + (dy,) = paddle.grad( + [out], [y], [dout], retain_graph=True, create_graph=True + ) + ddy = paddle.to_tensor( + np.ones([3]), stop_gradient=False, dtype='complex64' + ) + dx_double_grad, ddout = paddle.grad( + [dy], + [x, dout], + [ddy], + retain_graph=True, + create_graph=True, + ) + return dx_double_grad, ddout + + def expected(): + dx_double_grad_expected = np.ones([3], dtype="float32") + ddout_expected = np.matmul( + input_numpy_x_conj, np.ones([3], dtype="float32") + ) + return ( + dx_double_grad_expected, + ddout_expected, + ) + + expected_results = expected() + places = ["cpu"] + if (paddle.is_compiled_with_cuda() or is_custom_device()): + places.append(get_device()) + for place in places: + paddle.device.set_device(place) + actual_results = actual() + for expected_result, actual_result in zip( + expected_results, actual_results + ): + np.testing.assert_allclose( + expected_result, actual_result, rtol=1e-6 + ) + """ + + def test_value_error(self): + def test(): + import paddle + from paddle import nn + + model = 
nn.Sequential(nn.Linear(3, 4)) + + x = paddle.randn([4, 1]) + y = paddle.randn([4, 1]) + z = paddle.randn([4, 1]) + x.stop_gradient = False + y.stop_gradient = False + z.stop_gradient = False + out = model(paddle.concat((x, y, z), axis=1)) + + data = { + "x": x, + "y": y, + "z": z, + "u": out[:, 0:1], + "v": out[:, 1:2], + "w": out[:, 2:3], + "p": out[:, 3:4], + } + + v = out[:, 1:2] + z = paddle.grad(v, x, create_graph=True)[0] + zz = paddle.grad(z, x, create_graph=True)[0] + + with self.assertRaises(ValueError): + test() + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py b/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py new file mode 100644 index 00000000000..e39de09d6e4 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_linalg_matrix_exp_metax.py @@ -0,0 +1,268 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import unittest + +import numpy as np +import scipy +from op_test import get_places + +import paddle + +from paddle.base import core + +core.set_cublas_switch(False) + +os.environ["NVIDIA_TF32_OVERRIDE"] = "0" + +if sys.platform == "win32": + RTOL = {"float32": 1e-02, "float64": 1e-04} + ATOL = {"float32": 1e-02, "float64": 1e-04} +elif sys.platform == "darwin": + RTOL = {"float32": 1e-06, "float64": 1e-12} + ATOL = {"float32": 1e-06, "float64": 1e-12} +elif scipy.__version__ < "1.15": + RTOL = {"float32": 1e-06, "float64": 1e-15} + ATOL = {"float32": 1e-06, "float64": 1e-15} +else: + RTOL = {"float32": 1e-06, "float64": 1e-13} + ATOL = {"float32": 1e-06, "float64": 1e-13} + + +class MatrixExpTestCase(unittest.TestCase): + def setUp(self): + self.init_config() + self.generate_input() + self.generate_output() + self.places = get_places() + + def generate_input(self): + self._input_shape = (5, 5) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + def generate_output(self): + self._output_data = scipy.linalg.expm(self._input_data) + + def init_config(self): + self.dtype = "float64" + + def test_dygraph(self): + for place in self.places: + paddle.disable_static(place) + x = paddle.to_tensor(self._input_data, place=place) + out = paddle.linalg.matrix_exp(x).numpy() + + np.testing.assert_allclose( + out, + self._output_data, + rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + # TODO(megemini): cond/while_loop should be tested in pir + # + def test_static(self): + paddle.enable_static() + + for place in get_places(): + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x = paddle.static.data( + name="input", + shape=self._input_shape, + dtype=self._input_data.dtype, + ) + + out = paddle.linalg.matrix_exp(x) + exe = paddle.static.Executor(place) + + res = exe.run( + feed={"input": self._input_data}, + fetch_list=[out], + )[0] + + np.testing.assert_allclose( + res, + self._output_data, + 
rtol=RTOL.get(self.dtype), + atol=ATOL.get(self.dtype), + ) + + def test_grad(self): + for place in self.places: + x = paddle.to_tensor(self._input_data, place=place, stop_gradient=False) + out = paddle.linalg.matrix_exp(x) + out.backward() + x_grad = x.grad + + self.assertEqual(list(x_grad.shape), list(x.shape)) + self.assertEqual(x_grad.dtype, x.dtype) + + +class MatrixExpTestCaseFloat32(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCase3D(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 5, 5) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCase3DFloat32(MatrixExpTestCase3D): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCase4D(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 3, 5, 5) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCase4DFloat32(MatrixExpTestCase4D): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCaseEmpty(MatrixExpTestCase): + def generate_input(self): + self._input_shape = () + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCaseEmptyFloat32(MatrixExpTestCaseEmpty): + def init_config(self): + self.dtype = "float32" + + +class MatrixExpTestCaseScalar(MatrixExpTestCase): + def generate_input(self): + self._input_shape = (2, 3, 1, 1) + np.random.seed(123) + self._input_data = np.random.random(self._input_shape).astype(self.dtype) + + +class MatrixExpTestCaseScalarFloat32(MatrixExpTestCaseScalar): + def init_config(self): + self.dtype = "float32" + + +# test precision for float32 with l1_norm comparing `conds` +class MatrixExpTestCasePrecisionFloat32L1norm0(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.2], [-0.2, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat32L1norm1(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.8], [-0.8, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat32L1norm2(MatrixExpTestCase): + def init_config(self): + self.dtype = "float32" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 2.0], [-2.0, 0]]).astype(self.dtype) + + +# test precision for float64 with l1_norm comparing `conds` +class MatrixExpTestCasePrecisionFloat64L1norm0(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.01], [-0.01, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm1(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.1], [-0.1, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm2(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 0.5], [-0.5, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm3(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + 
self._input_shape = (2, 2) + self._input_data = np.array([[0, 1.5], [-1.5, 0]]).astype(self.dtype) + + +class MatrixExpTestCasePrecisionFloat64L1norm4(MatrixExpTestCase): + def init_config(self): + self.dtype = "float64" + + def generate_input(self): + self._input_shape = (2, 2) + self._input_data = np.array([[0, 2.5], [-2.5, 0]]).astype(self.dtype) + + +# test error cases +class MatrixExpTestCaseError(unittest.TestCase): + def test_error_dtype(self): + with self.assertRaises(ValueError): + x = np.array(123, dtype=int) + paddle.linalg.matrix_exp(x) + + def test_error_ndim(self): + # 1-d + with self.assertRaises(ValueError): + x = np.random.rand(1) + paddle.linalg.matrix_exp(x) + + # not square + with self.assertRaises(ValueError): + x = np.random.rand(3, 4) + paddle.linalg.matrix_exp(x) + + with self.assertRaises(ValueError): + x = np.random.rand(2, 3, 4) + paddle.linalg.matrix_exp(x) + + +if __name__ == "__main__": + unittest.main()
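+# Informal note: the MatrixExpTestCasePrecision* cases feed skew-symmetric inputs
+# of the form [[0, a], [-a, 0]], whose matrix exponential has the closed form of
+# a rotation matrix, so the scipy reference is easy to sanity-check by hand:
+#   expm([[0, a], [-a, 0]]) = [[cos(a), sin(a)], [-sin(a), cos(a)]]
+# For example, with a = 0.5:
+#   a = 0.5
+#   ref = np.array([[np.cos(a), np.sin(a)], [-np.sin(a), np.cos(a)]])
+#   np.testing.assert_allclose(
+#       scipy.linalg.expm(np.array([[0.0, a], [-a, 0.0]])), ref, rtol=1e-6
+#   )
+# Varying the magnitude of `a` changes the input's L1 norm, which is what selects
+# the different approximation branches (the `conds` thresholds) for each dtype.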