From 762cf919abd7344c37773414c998186649142cb8 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Tue, 9 Sep 2025 16:04:01 +0800
Subject: [PATCH 1/9] [Metax] fix dgc & mklml compile product path problem

---
 backends/metax_gpu/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index 787aae13e40..b6f9abccf14 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -26,6 +26,10 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")
 
 set(WITH_MKLML ON)
+set(THIRD_PARTY_PATH
+    "${PADDLE_SOURCE_DIR}/build/third_party"
+    CACHE PATH "Third party libraries directory.")
+
 include(paddle)
 set(THIRD_PARTY_PATH
     "${PADDLE_SOURCE_DIR}/build/third_party"

From 9249b1ecd662bf6d266ef404834a245eec81b115 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Thu, 11 Sep 2025 17:28:51 +0800
Subject: [PATCH 2/9] [Metax] update metax_gpu CMakeLists.txt

---
 backends/metax_gpu/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index b6f9abccf14..787aae13e40 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -26,10 +26,6 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")
 
 set(WITH_MKLML ON)
-set(THIRD_PARTY_PATH
-    "${PADDLE_SOURCE_DIR}/build/third_party"
-    CACHE PATH "Third party libraries directory.")
-
 include(paddle)
 set(THIRD_PARTY_PATH
     "${PADDLE_SOURCE_DIR}/build/third_party"

From 6634ef1611a035f00948245fe3aaaf5d79f97366 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Tue, 16 Sep 2025 18:12:17 +0800
Subject: [PATCH 3/9] [Metax] organize documents

---
 .../calc_reduced_attn_kernel_register.cu      |   2 +-
 backends/metax_gpu/kernels/funcs/softmax.cu   |   2 +-
 .../kernels/funcs/values_vectors_functor.h    |   2 +-
 .../metax_gpu/kernels/gpudnn/conv_cudnn_v7.h  |   2 +-
 .../conv_transpose_grad_kernel_register.cu    |   2 +-
 .../kernels/gpudnn/pool_kernel_register.cu    |   2 +-
 .../metax_gpu/kernels/gpudnn/softmax_gpudnn.h |   2 +-
 .../kernels/impl/dirichlet_kernel_impl.h      |   2 +-
 .../addmm_grad_kernel_register.cu             |   0
 .../addmm_kernel_register.cu                  |   0
 .../batch_fc_grad_kernel_register.cu          |   0
 .../batch_norm_grad_kernel_register.cu        |   2 +-
 .../batch_norm_kernel_register.cu             |   0
 .../bilinear_grad_kernel_register.cu          |   0
 .../bilinear_kernel_register.cu               |   0
 .../metax_kernel/blha_get_max_len_register.cu |   2 +-
 .../bmm_grad_kernel_register.cu               |   0
 .../bmm_kernel_register.cu                    |   0
 ...abel_cross_entropy_grad_kernel_register.cu |   0
 .../cholesky_grad_kernel_register.cu          |   0
 .../metax_kernel/cholesky_kernel_register.cu  |   2 +-
 .../conv_kernel_register.cu                   |   0
 .../conv_transpose_kernel_register.cu         |   0
 .../crop_kernel_register.cu                   |   0
 .../cross_entropy_kernel_register.cu          |   2 +-
 .../depthwise_conv_grad_kernel.cu             |   0
 .../depthwise_conv_kernel.cu                  |   0
 .../kernels/{ => metax_kernel}/elementwise.h  |   0
 .../{ => metax_kernel}/flags_declare.cu       |   0
 .../flash_attn_grad_kernel.cu                 |   0
 .../{ => metax_kernel}/flash_attn_kernel.cu   |   0
 .../{ => metax_kernel}/flash_attn_kernel.h    |   0
 .../{ => metax_kernel}/flash_attn_utils.h     |   0
 .../kernels/{ => metax_kernel}/flashattn.cc   |   0
 .../kernels/{ => metax_kernel}/flashattn.h    |   0
 .../flatten2_grad_kernel_register.cu          |   0
 .../flatten2_kernel_register.cu               |   0
 .../fused_conv2d_add_act_kernel_register.cu   |   3 +-
 .../fused_rope_grad_kernel_register.cu        |   0
 .../fused_rope_kernel_register.cu             |   0
 .../instance_norm_grad_kerne_registerl.cu     |   2 +-
 .../instance_norm_kernel_register.cu          |   2 +-
 .../layer_norm_grad_kernel_register.cu        |   0
 .../layer_norm_kernel_register.cu             |   0
 .../lstm_kernel_register.cu                   |   0
 .../metax_kernel/lu_kernel_register.cu        |   2 +-
 .../lu_solve_grad_kernel_register.cu          |   0
 .../metax_kernel/matrix_rank_tol_kernel.cu    |   2 +-
 .../{ => metax_kernel}/metax_context.cc       |  24 +-
 .../{ => metax_kernel}/metax_context.h        |   6 +-
 .../multi_dot_grad_kernel_register.cu         |   0
 .../multi_dot_kernel_register.cu              |   0
 .../mv_grad_kernel_register.cu                |   0
 .../mv_kernel_register.cu                     |   0
 .../metax_kernel/qr_kernel_register.cu        |   2 +-
 .../rank_attention_grad_kernel_register.cu    |   0
 .../rank_attention_kernel_register.cu         |   0
 .../metax_kernel/rnn_grad_kernel.cu.cc        |   2 +-
 .../kernels/metax_kernel/rnn_kernel.cu.cc     |   2 +-
 .../slogdeterminant_kernel_register.cu        |   0
 .../softmax_kernel_grad_register.cu           |   0
 .../softmax_kernel_register.cu                |   0
 .../solve_grad_kernel_register.cu             |   0
 .../standard_gamma_kernel_register.cu         |   0
 .../stft_kernel_register.cu                   |   0
 .../svd_kernel_register.cu                    |   0
 .../top_k_grad_kernel_register.cu             |   0
 .../triangular_solve_grad_kernel_register.cu  |   0
 .../triangular_solve_kernel_register.cu       |   0
 .../warprnnt_kernel_register.cu               |   0
 .../weight_only_linear_kernel.cu              |   0
 .../weight_quantize_kernel_register.cu        |   0
 backends/metax_gpu/patch/paddle.patch         | 204 +++++++++---------
 backends/metax_gpu/tests/CMakeLists.txt       |  54 ++---
 74 files changed, 166 insertions(+), 163 deletions(-)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_fc_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_norm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/cholesky_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_transpose_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/crop_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_grad_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/elementwise.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flags_declare.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_grad_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_utils.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.cc (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.h (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_conv2d_add_act_kernel_register.cu (99%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lstm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lu_solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.cc (90%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.h (96%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/slogdeterminant_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_grad_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/standard_gamma_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/stft_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/svd_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/top_k_grad_kernel_register.cu (100%)
 mode change 100755 => 100644
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/warprnnt_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_only_linear_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_quantize_kernel_register.cu (100%)

diff --git a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
index 11def2c9ee4..2aa8424f0b1 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/flash_attn_utils.h"
+#include "kernels/metax_kernel/flash_attn_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/calc_reduced_attn_kernel.h"

diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu
index d738a53f43a..44bfd02a308 100644
--- a/backends/metax_gpu/kernels/funcs/softmax.cu
+++ b/backends/metax_gpu/kernels/funcs/softmax.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include
 
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

diff --git a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
index ec429950872..8c5996e680b 100644
--- a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
+++ b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
@@ -24,7 +24,7 @@
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/common/errors.h"
 #endif
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
index da61a1e5b41..a0f89047045 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
+++ b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "glog/logging.h"
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
 #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
index 0067818d165..b7eebfcee2e 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include
 
 #include "kernels/gpudnn/conv_cudnn_v7.h"
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/common/ddim.h"
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/dynload/cudnn.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
index c115f5ad930..1c2bfeedf34 100644
--- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gpudnn/pool_gpudnn.h"
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
index 168752700e9..5844886ad1b 100644
--- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
+++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
@@ -25,7 +25,7 @@
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"

diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
index 70af87513e5..c2e2e341bf5 100644
--- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
+++ b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
@@ -17,7 +17,7 @@
 #include
 #include
 
-#include "kernels/elementwise.h"
+#include "kernels/metax_kernel/elementwise.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
index 062646bbf9d..52fe5a1d566 100644
--- a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/flags.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu index bc9eb23c0e8..42810569fde 100644 --- a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" #include "kernels/metax_kernel/block_attn.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu 
b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu index e8fae2d9da5..8a39ae3f0a8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu index e94862ec7b0..043a64dc149 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/kernels/cross_entropy_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu diff --git a/backends/metax_gpu/kernels/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h similarity index 100% rename from backends/metax_gpu/kernels/elementwise.h rename to backends/metax_gpu/kernels/metax_kernel/elementwise.h diff --git a/backends/metax_gpu/kernels/flags_declare.cu b/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu similarity index 100% rename from backends/metax_gpu/kernels/flags_declare.cu rename to backends/metax_gpu/kernels/metax_kernel/flags_declare.cu diff --git a/backends/metax_gpu/kernels/flash_attn_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h diff --git a/backends/metax_gpu/kernels/flash_attn_utils.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_utils.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h diff --git a/backends/metax_gpu/kernels/flashattn.cc b/backends/metax_gpu/kernels/metax_kernel/flashattn.cc similarity index 100% rename from backends/metax_gpu/kernels/flashattn.cc rename to backends/metax_gpu/kernels/metax_kernel/flashattn.cc diff --git a/backends/metax_gpu/kernels/flashattn.h b/backends/metax_gpu/kernels/metax_kernel/flashattn.h similarity index 100% rename from backends/metax_gpu/kernels/flashattn.h rename to backends/metax_gpu/kernels/metax_kernel/flashattn.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu similarity index 100% rename from 
rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu
similarity index 99%
rename from backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu
index 48809ceefa4..c0d15b7f1b4 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu
@@ -308,7 +308,8 @@ class CudnnConvDescManager {
       int groups,
       cudnnDataType_t dtype) {
     auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
-    desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
+    desc->set(
+        dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
     return desc;
   }

diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu
index d7540d949a9..bdf341f5a35 100644
--- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "glog/logging.h"
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/common/layout.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"

diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
index db975d74665..e0c0ae9c1d6 100644
--- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "glog/logging.h"
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/common/layout.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"

diff --git a/backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/layer_norm_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu
index 5a2d85418a1..72e4c5b2b79 100644
--- a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu
@@ -18,7 +18,7 @@
 #include "paddle/phi/backends/dynload/cusolver.h"
 #endif
 
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/enforce.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu
index bda5dc62f1a..d8c3355e6e4 100644
--- a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu
@@ -18,7 +18,7 @@
 #include
 #include
 
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/dynload/cusolver.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/common/type_traits.h"

diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc
similarity index 90%
rename from backends/metax_gpu/kernels/metax_context.cc
rename to backends/metax_gpu/kernels/metax_kernel/metax_context.cc
index f0c92f00565..62aaa5fb2de 100644
--- a/backends/metax_gpu/kernels/metax_context.cc
+++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc
@@ -12,27 +12,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" namespace phi { const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; }(); const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; }(); bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h similarity index 96% rename from backends/metax_gpu/kernels/metax_context.h rename to backends/metax_gpu/kernels/metax_kernel/metax_context.h index 683a6df7017..a6610c1dab2 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ -#define BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ +#define BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ #include #include #include @@ -161,4 +161,4 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, return DnnWorkspaceHandle(alloactor, stream); } } // namespace phi -#endif // BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#endif // BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 745069e2eda..c3041254444 100644 --- 

diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu
index 745069e2eda..c3041254444 100644
--- a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu
@@ -22,7 +22,7 @@
 #include
 #include
 
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/enforce.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc
index 499832049e4..101b51aa350 100644
--- a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc
+++ b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/phi/kernels/rnn_grad_kernel.h"
 
-#include "kernels/metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"

diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc
index f1cf9e09dc7..2598ce093e6 100644
--- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc
+++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc
@@ -15,7 +15,7 @@
 #include "paddle/phi/kernels/rnn_kernel.h"
 
 #include "glog/logging.h"
-#include "kernels/metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/generator.h"
 #include "paddle/phi/core/kernel_registry.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu
old mode 100755
new mode 100644
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu
rename to backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu

diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch
index 0283a443adb..e56826c4f3e 100755
--- a/backends/metax_gpu/patch/paddle.patch
+++ b/backends/metax_gpu/patch/paddle.patch
@@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644
-     set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header})
+     # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header})
    endif()
- 
+ 
   set(EIGEN_INCLUDE_DIR ${SOURCE_DIR})
 diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc
 index bff0f2bf70..9376b5781f 100644
 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc
 +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc
 @@ -16,7 +16,7 @@
- 
+ 
  #include
- 
+ 
 -#include "paddle/fluid/platform/enforce.h"
 +// #include "paddle/fluid/platform/enforce.h"
  #include "paddle/phi/core/os_info.h"
@@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644
+  __macro(cudnnDestroyActivationDescriptor); \
+  __macro(cudnnSetRNNDescriptor_v6);
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
- 
+ 
 #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
@@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
@@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644
+  __macro(cudnnRNNForwardInferenceEx);
 CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
- 
+ 
@@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
- 
+ 
-#if CUDNN_VERSION < 90000
-#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \
-  __macro(cudnnGetRNNParamsSize);                      \
-#endif
 } // namespace dynload
 } // namespace phi
 
diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h
index 1547909d92..66b2779392 100644
--- a/paddle/phi/backends/dynload/cufft.h
+++ b/paddle/phi/backends/dynload/cufft.h
@@ -1,3 +1,4 @@
-+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. 
++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- 
+ 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
       cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \

diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h
index 59e92955c9..d2f8c2da15 100644
--- a/paddle/phi/backends/dynload/cupti.h
+++ b/paddle/phi/backends/dynload/cupti.h
@@ -24,8 +24,8 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 #include "paddle/phi/common/port.h"
 
-namespace phi {
-namespace dynload {
+// namespace phi {
+// namespace dynload {
 
 extern std::once_flag cupti_dso_flag;
 extern void *cupti_dso_handle;
@@ -71,7 +71,7 @@ extern void *cupti_dso_handle;
 CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
 
 #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
-} // namespace dynload
-} // namespace phi
+// } // namespace dynload
+// } // namespace phi
 
-#endif // PADDLE_WITH_CUPTI
+#endif // PADDLE_WITH_CUPTI
\ No newline at end of file

@@ -230,28 +230,28 @@ index 4ff2e528a9..81421c8ca1 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
@@ -1,3 +1,4 @@
++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
 /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
- 
+ 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,7 +26,7 @@ namespace phi {
 namespace backends {
 namespace gpu {
- 
-#define FULL_WARP_MASK 0xFFFFFFFF
+#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL
 #define CREATE_SHFL_MASK(mask, predicate) \
   mask = __ballot_sync(FULL_WARP_MASK, (predicate))
- 
+ 
@@ -45,12 +46,12 @@ namespace gpu {
- 
+ 
 template
 __forceinline__ __device__ T
-CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) {
+CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) {
   return __shfl_down_sync(mask, val, static_cast(delta), width);
 }
- 
+ 
 template
-__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask,
+__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask,
@@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644
   int width = warpSize) {
   return __shfl_xor_sync(mask, val, width);
@@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask,
- 
+ 
 template <>
 __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync(
-    unsigned mask, phi::dtype::float16 val, int delta, int width) {
+    unsigned long long mask, phi::dtype::float16 val, int delta, int width) {
   return phi::dtype::float16(__shfl_down_sync(
       mask, val.to_half(), static_cast(delta), width));
 }
- 
+ 
 template <>
 __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync(
-    unsigned mask, phi::dtype::bfloat16 val, int delta, int width) {
+    unsigned long long mask, phi::dtype::bfloat16 val, int delta, int width) {
   return phi::dtype::bfloat16(__shfl_down_sync(
       mask, val.to_nv_bfloat16(), static_cast(delta), width));
@@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
-    unsigned mask, phi::dtype::complex val, int delta, int width) {
+    unsigned long long mask, phi::dtype::complex val, int delta, int width) {
       mask, static_cast(val.real), static_cast(delta), width));
   float imag = static_cast(__shfl_down_sync(
@@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
-    unsigned mask, phi::dtype::complex val, int delta, int width) {
+    unsigned long long mask, phi::dtype::complex val, int delta, int width) {
       static_cast(__shfl_down_sync(mask,
                                    static_cast(val.real),
@@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync(
-    unsigned mask, phi::dtype::float16 val, int width) {
+    unsigned long long mask, phi::dtype::float16 val, int width) {
   return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width));
 }
 
 template <>
 __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync(
-    unsigned mask, phi::dtype::bfloat16 val, int width) {
+    unsigned long long mask, phi::dtype::bfloat16 val, int width) {
   return phi::dtype::bfloat16(
       __shfl_xor_sync(mask, val.to_nv_bfloat16(), width));
@@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
-    unsigned mask, phi::dtype::complex val, int width) {
+    unsigned long long mask, phi::dtype::complex val, int width) {
       __shfl_xor_sync(mask, static_cast(val.real), width));
   float imag = static_cast(
@@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
-    unsigned mask, phi::dtype::complex val, int width) {
+    unsigned long long mask, phi::dtype::complex val, int width) {
       __shfl_xor_sync(mask, static_cast(val.real), width));
   double imag = static_cast(
@@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
 
 template
 __forceinline__ __device__ T
-CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) {
+CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) {
   return __shfl_sync(mask, val, src_line, width);
 }
 
@@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) {
   // but most card's warp size is 32.
   const int warpSize = 32;
@@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644
-  unsigned mask = 0u;
+  unsigned long long mask = 0ull;
   CREATE_SHFL_MASK(mask, tid < len);
 
   for (int offset = warpSize / 2; offset > 0; offset /= 2)
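
NOTE (review): the cuda_device_function.h hunks above widen FULL_WARP_MASK and every shuffle-mask parameter from 32-bit unsigned to unsigned long long, consistent with a 64-lane wavefront; a 32-bit mask would silently drop lanes 32-63. A host-side C++ sketch of that truncation hazard (hypothetical names; GCC/Clang builtins assumed):

    // warp_mask_sketch.cc -- why mask width matters on 64-lane warps.
    // Build: g++ warp_mask_sketch.cc
    #include <cstdint>
    #include <cstdio>

    // "First n lanes" mask, analogous to what CREATE_SHFL_MASK produces.
    static uint64_t LaneMask(int n) {
      return n >= 64 ? ~0ull : (1ull << n) - 1;
    }

    int main() {
      uint64_t full64 = 0xFFFFFFFFFFFFFFFFull;          // FULL_WARP_MASK after the patch
      uint32_t full32 = static_cast<uint32_t>(full64);  // pre-patch 32-bit width
      std::printf("lanes covered, 64-bit mask: %d\n", __builtin_popcountll(full64));
      std::printf("lanes covered, 32-bit mask: %d\n", __builtin_popcount(full32));
      std::printf("mask for 40 active lanes: %#llx\n",
                  static_cast<unsigned long long>(LaneMask(40)));
      return 0;
    }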
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,19 +514,19 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 @@ -535,7 +535,7 @@ index e30d440ff3..3c74792690 100644 @@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,14 +721,14 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; - asm("bfi.b32 %0, %1, %2, %3, %4;" - : "=r"(ret) - : "r"(to_insert), "r"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (32 - pos - len)) >> (32 - len); return ret; } @@ -738,12 +738,12 @@ index e30d440ff3..3c74792690 100644 int len) { uint64_t ret; - asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); -+ ++ + + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -751,7 +751,7 @@ index e30d440ff3..3c74792690 100644 - asm("bfi.b64 %0, %1, %2, %3, %4;" - : "=l"(ret) - : "l"(to_insert), "l"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); + return ret; @@ -763,7 +763,7 @@ index e30d440ff3..3c74792690 100644 int lane_id; - asm("mov.s32 %0, %%laneid;" : "=r"(lane_id)); - return lane_id; -+ ++ +// // >>>> PTX2CPP Success <<<< +// { +// 
(lane_id)=(threadIdx.x&(warpSize-1)); @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,12 +843,12 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,7 +863,7 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
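Note on the top_k_function_cuda.h hunks earlier in this patch: they widen the warp primitives from 32 to 64 lanes (FINAL_MASK and WARP_SIZE) and replace the inline PTX (the bfe/bfi bitfield instructions, %%laneid, %%lanemask_le) with plain integer arithmetic. Below is a minimal sketch of that arithmetic in Python, for illustration only; the helper names and the explicit 64-bit mask are assumptions, not part of the patch.

WIDTH = 64  # wave width assumed by the patched kernels

def bitfield_extract(val: int, pos: int, length: int, width: int = WIDTH) -> int:
    # Same trick as `(val << (width - pos - len)) >> (width - len)` in the hunk:
    # shift the field up to the top of the register, then back down, so only
    # bits [pos, pos + length) survive. Python needs an explicit mask because
    # its integers never overflow.
    mask = (1 << width) - 1
    return ((val << (width - pos - length)) & mask) >> (width - length)

def lane_mask_le(lane_id: int) -> int:
    # Same as `((uint64_t(1) << __lane_id()) << 1) - 1`: set every bit at a
    # position less than or equal to lane_id.
    return ((1 << lane_id) << 1) - 1

assert bitfield_extract(0b101100, pos=2, length=3) == 0b011
assert lane_mask_le(3) == 0b1111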
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +890,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +901,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +914,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -948,7 +948,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +961,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +993,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,27 +1013,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1048,12 +1048,12 @@ index 5ebbc8d2db..48acf8d0cd 100644 -#include "paddle/phi/kernels/funcs/quant_dequant.h" +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_context.h" - ++#include "kernels/metax_kernel/metax_context.h" + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1067,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1080,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,14 +1118,14 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/third_party/flagcx b/third_party/flagcx index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 37475773026..410ef006514 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -87,32 +87,34 @@ list( list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # 受 test_sum_op.py 影响 - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion - # 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # op_test.py 里 - # self._get_places() - # 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64 - # precision - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # op_test.py 里 - # self._get_places() 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties -) + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) foreach(test_script ${PYTHON_TEST_SCRIPTS}) From c05f95f939c84359cc5bee3e3d88cb7235f55e83 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Fri, 19 Sep 2025 14:57:09 +0800 Subject: [PATCH 4/9] [Metax] add log analysis script --- .../metax_gpu/tests/scripts/classify.json | 22 ++ .../metax_gpu/tests/scripts/log_analysis.py | 216 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 backends/metax_gpu/tests/scripts/classify.json create mode 100644 backends/metax_gpu/tests/scripts/log_analysis.py diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json new file mode 100644 index 00000000000..b97255adc3d --- /dev/null +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -0,0 +1,22 @@ +{ + "OK":{ + "skipped":{ + "rule":["skipped="] + } + }, + + "FAILED":{ + "precision":{ + "rule":["Mismatched elements"] + }, + "api":{ + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + }, + "missing":{ + "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + }, + "file_not_found":{ + "rule":["FileNotFoundError:"] + } + } +} diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py new file mode 100644 index 00000000000..c0716f5b6f5 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -0,0 +1,216 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
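For orientation (a sketch for illustration, not part of the patched file): classify.json above is a two-level table mapping a result status ("OK"/"FAILED") to categories, each carrying a "rule" list of substrings, and a log line is assigned to a category as soon as one of that category's substrings occurs in it. A minimal sketch of the matching, with illustrative rules only:

rules = {
    "precision": ["Mismatched elements"],
    "missing": ["missing metax_gpu kernel"],
}

def classify(line: str, rules: dict) -> str:
    # The first category whose keyword occurs in the line wins;
    # "other" is the fallback, as in the analyzer below.
    for category, keywords in rules.items():
        if any(keyword in line for keyword in keywords):
            return category
    return "other"

assert classify("Mismatched elements: 1 / 100 (1%)", rules) == "precision"
assert classify("some unrelated line", rules) == "other"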
+
+import json
+import os
+import fnmatch
+import shutil
+from enum import Enum
+
+
+class TestResult(Enum):
+    OK = "OK"
+    FAILURE = "FAILED"
+
+
+class LogAnalyzer:
+    def __init__(
+        self,
+        classify_file: str,
+        search_path: str,
+        pattern: str = None,
+        encoding: str = "utf-8",
+    ):
+        self.__pattern = pattern
+        self.__search_path = search_path
+        self.__encoding = encoding
+        self.__statistical_data = {}
+
+        self.__classify_data = self.__read_json_file(classify_file)
+        for key, value in self.__classify_data.items():
+            self.__statistical_data[key] = {}
+            for sub_key in list(value.keys()):
+                self.__statistical_data[key][sub_key] = []
+
+        self.__statistical_data[TestResult.OK.value]["noskip"] = []
+        self.__statistical_data[TestResult.FAILURE.value]["other"] = []
+
+    def __read_json_file(self, path: str) -> dict:
+        with open(path, "r", encoding=self.__encoding) as f:
+            data = json.load(f)
+            f.close()
+        return data
+
+    def __check_path(self, path: str) -> None:
+        """
+        Handle the given path:
+        - directory path: create it if missing, otherwise clear its contents
+        - file path: create it if missing, otherwise truncate its contents
+        """
+        try:
+            # Check whether the path exists
+            if os.path.exists(path):
+                # The path exists; decide whether it is a file or a directory
+                if os.path.isfile(path):
+                    # Handle a file: truncate its contents
+                    with open(path, "w", encoding="utf-8") as f:
+                        f.write("")  # write empty content to clear the file
+                    # print(f"File already exists, contents cleared: {path}")
+
+                elif os.path.isdir(path):
+                    # Handle a directory: remove everything inside it
+                    for item in os.listdir(path):
+                        item_path = os.path.join(path, item)
+                        if os.path.isfile(item_path) or os.path.islink(item_path):
+                            os.remove(item_path)  # remove the file or symlink
+                        elif os.path.isdir(item_path):
+                            shutil.rmtree(item_path)  # recursively remove the subdirectory
+                    # print(f"Directory already exists, contents cleared: {path}")
+            else:
+                # The path does not exist; infer the target type (by whether the last component has an extension)
+                # Take the last component of the path
+                last_part = os.path.basename(path)
+
+                # Treat it as a file path if the name contains an extension
+                if "." in last_part and not last_part.endswith("."):
+                    # Create the file (including parent directories)
+                    parent_dir = os.path.dirname(path)
+                    if parent_dir and not os.path.exists(parent_dir):
+                        os.makedirs(parent_dir, exist_ok=True)
+                    with open(path, "w", encoding="utf-8") as f:
+                        pass  # create an empty file
+                    # print(f"File did not exist, created: {path}")
+
+                else:
+                    # Create the directory (multi-level paths supported)
+                    os.makedirs(path, exist_ok=True)
+                    # print(f"Directory did not exist, created: {path}")
+
+        except PermissionError:
+            print(f"Permission error: cannot operate on path {path}")
+        except Exception as e:
+            print(f"Error while handling path: {str(e)}")
+
+    def save_result(self, dir_path: str = "./") -> None:
+        """
+        Ensure each category directory exists:
+        - create it if missing
+        - otherwise clear everything inside it (keeping the directory itself)
+        """
+
+        for key, value in self.__statistical_data.items():
+            sub_dir = os.path.join(dir_path, key)
+            self.__check_path(sub_dir)
+
+            for sub_key, sub_value in value.items():
+                # print(f"{sub_key}: {len(value[sub_key])} - ({sub_value})")
+                try:
+                    with open(
+                        os.path.join(sub_dir, sub_key) + ".txt", "w", encoding="utf-8"
+                    ) as f:
+                        for op_name in sub_value:
+                            if not op_name.endswith("\n"):
+                                op_name += "\n"
+                            f.write(op_name)
+                    # print(f"Content successfully {'appended' if append else 'written'} to {file_path}")
+                except Exception as e:
+                    print(f"Failed to write file: {e}")
+
+    def show_result(self) -> None:
+        test_counts = 0
+        for key, value in self.__statistical_data.items():
+            print(f"\n---------- {key} ----------")
+            for sub_key, sub_value in value.items():
+                test_counts = test_counts + len(value[sub_key])
+                print(f"{sub_key}: {len(value[sub_key])}\n\t{sub_value}\n")
+        print(
+            f"\n******************* Total log num: {test_counts} *******************\n\n"
+        )
+
+    def run(self):
+        """
+        Read the files under the search path whose names match the pattern
+        and scan them line by line.
+
+        Args:
+            search_path: root directory to search
+            pattern: file name pattern (wildcards supported, e.g. '*.txt', 'file_*.log')
+        """
+        for dirpath, dirnames, filenames in os.walk(self.__search_path):
+            for filename in fnmatch.filter(filenames, self.__pattern):
+                file_path = os.path.join(dirpath, filename)
+                # print(f"\n===== Processing file: {file_path} =====")
+
+                cur_res_type = TestResult.FAILURE
+                cur_sub_type = "other"
+                pre_line = None
+                finish_early = False
+
+                try:
+                    with open(file_path, "r", encoding=self.__encoding) as f:
+                        for line in f:
+                            for sub_type, sub_type_params in self.__classify_data[
+                                cur_res_type.value
+                            ].items():
+                                for keyword in sub_type_params["rule"]:
+                                    if keyword in line:
+                                        cur_sub_type = sub_type
+                                        if sub_type == "missing":
+                                            finish_early = True
+                                        break
+
+                                if finish_early:
+                                    break
+
+                            pre_line = line
+                            if finish_early:
+                                break
+
+                        if "OK" in pre_line:
+                            cur_res_type = TestResult.OK
+                            cur_sub_type = None
+                            for sub_type, sub_type_params in self.__classify_data[
+                                cur_res_type.value
+                            ].items():
+                                for rule in sub_type_params["rule"]:
+                                    if rule in line:
+                                        cur_sub_type = sub_type
+
+                        op_name = filename.split(".")
+                        if cur_sub_type is None:
+                            self.__statistical_data[cur_res_type.value][
+                                "noskip"
+                            ].append(op_name[0])
+                        else:
+                            self.__statistical_data[cur_res_type.value][
+                                cur_sub_type
+                            ].append(op_name[0])
+                        # print(f"Result: {cur_res_type.value}, type: {cur_sub_type}")
+                    f.close()
+                except UnicodeDecodeError:
+                    print(f"Warning: file {file_path} is not utf-8 encoded, skipping it")
+                except Exception as e:
+                    print(f"Error while processing file {file_path}: {str(e)}")
+
+
+if __name__ == "__main__":
+
+    analyzer = LogAnalyzer(
+        classify_file="./classify.json",
+        search_path="./NPU_logs/20250918_065326",
+        pattern="test_*.log",
+    )
+
+    analyzer.run()
+    analyzer.show_result()
+    analyzer.save_result("./output")

From 465f54ac8a8ae5f89f045f87c7aaaed5088488ef Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Mon, 22 Sep 2025 17:29:30 +0800
Subject: [PATCH 5/9] [Metax] update metax backend CI test

---
 backends/metax_gpu/tests/CMakeLists.txt | 192 +++++++++++-------------
 backends/metax_gpu/tests/default.txt    |  67 +++++++++
 backends/metax_gpu/tests/run_test.sh    |  56 ++++++-
 3 files changed, 202 insertions(+), 113 deletions(-)
 create mode 100644 backends/metax_gpu/tests/default.txt

diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt
index 795a3c5b8ac..ded54233f24 100755
--- a/backends/metax_gpu/tests/CMakeLists.txt
+++ b/backends/metax_gpu/tests/CMakeLists.txt
@@ -11,117 +11,95 @@ set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test)

 file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py")

-list(
-  APPEND
-  PYTHON_TEST_SCRIPTS
-  ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
-
${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) - -list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - 
${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +if(NOT TEST_LIST_FILE) + message( + STATUS + " is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used." + ) + file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) + +else() + if(NOT EXISTS ${TEST_LIST_FILE}) + message(FATAL_ERROR " is not exist, please check it again.") + endif() + + file(STRINGS ${TEST_LIST_FILE} TEST_PROGRAMS) + + if(NOT TEST_PROGRAMS) + message(FATAL_ERROR " is empty.") + endif() + + set(PYTHON_TEST_SCRIPTS "") +endif() + +foreach(test_name ${TEST_PROGRAMS}) + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) + message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") + else() + list(APPEND PYTHON_TEST_SCRIPTS ${CURRENT_TEST_PROGRAM}) + endif() +endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) + +if(NOT TEST_LIST_FILE) + list( + REMOVE_ITEM + PYTHON_TEST_SCRIPTS + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口的适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +endif() + +if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) + file(MAKE_DIRECTORY ${LOG_OUTPUT_DIR}) + message(WARNING "${LOG_OUTPUT_DIR} is not exist, create it now.") +endif() + foreach(test_script ${PYTHON_TEST_SCRIPTS}) get_filename_component(test_name ${test_script} NAME_WE) - add_test( - NAME "python_${test_name}" - COMMAND ${Python_EXECUTABLE} ${test_script} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + if(LOG_OUTPUT_DIR) + set(test_log_file "${LOG_OUTPUT_DIR}/${test_name}.log") + + add_test( + NAME "python_${test_name}" + COMMAND sh -c + "${Python_EXECUTABLE} ${test_script} > ${test_log_file} 2>&1" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + + else() + add_test( + NAME "python_${test_name}" + COMMAND ${Python_EXECUTABLE} ${test_script} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) endforeach() diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt new file mode 100644 index 00000000000..8e2c3bcdd7e --- /dev/null +++ b/backends/metax_gpu/tests/default.txt @@ -0,0 +1,67 @@ +test_accuracy_op +test_tril_triu_op +test_where_op +test_split_op +test_fill_constant_op +test_empty_op +test_sign_op +test_cast_op +test_index_add_op +test_unbind_op +test_put_along_axis_op +test_layer_norm_op +test_maximum_op 
+test_accuracy_op
+test_strided_slice_op
+test_sum_op
+test_set_value_op
+test_flatten_contiguous_range_op
+test_top_k_op
+test_subtract_op
+test_softmax_op
+test_cumsum_op
+test_greater_equal_op
+test_elementwise_div_op
+test_top_k_v2_op
+test_stack_op
+test_one_hot_v2_op
+test_fill_any_op
+test_gather_op
+test_reshape_op
+test_index_put_op
+test_bitwise_op
+test_max_op
+test_pad_op
+test_elementwise_pow_op
+test_uniform_random_op
+test_scatter_op
+test_cast_op
+test_zeros_like_op
+test_compare_op
+test_shape_op
+test_tril_triu_op
+test_slice_op
+test_elementwise_add_op
+test_index_put_op
+test_bincount_op
+test_assign_op
+test_logical_op
+test_squared_l2_norm_op
+test_mean_op
+test_fused_bias_act_op
+test_expand_v2_op
+test_adamw_op
+test_gather_nd_op
+test_concat_op
+test_scatter_nd_op
+test_elementwise_floordiv_op
+test_elementwise_mul_op
+test_transpose_op
+test_einsum_op
+test_randint_op
+test_c_embedding_op
+test_numel_op
+test_scale_op
+test_softmax_with_cross_entropy_op
+test_full_op
+test_scatter_op
diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh
index 7d1e8e072a9..b9e8ec5b5cc 100755
--- a/backends/metax_gpu/tests/run_test.sh
+++ b/backends/metax_gpu/tests/run_test.sh
@@ -2,13 +2,13 @@
 #!/bin/bash
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-#
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,10 +29,54 @@ export

 rm -r build
 mkdir -p build && cd build

-cmake ..
+TEST_LOG_LEVEL=0
+TEST_LIST_FILE=""
+TEST_LOG_OUTPUT_DIR=""
+TEST_PARALLEL_NUM=10

-cmake --build .
+while getopts "i:o:v:j:h" opt; do
+  case "$opt" in
+    i)
+      TEST_LIST_FILE="$OPTARG"
+      ;;
+    o)
+      TEST_LOG_OUTPUT_DIR="$OPTARG"
+      echo "Set log output dir [ $TEST_LOG_OUTPUT_DIR ]"
+      ;;
+    v)
+      TEST_LOG_LEVEL=$OPTARG
+      ;;
+    j)
+      TEST_PARALLEL_NUM="$OPTARG"
+      ;;
+    h)
+      echo "Usage: $0 -i <test list file> -o <log output dir> ..."
+      echo "Options:"
+      echo "  -i  test list file"
+      echo "  -o  log output directory"
+      echo "  -v  GLOG_v log level"
+      echo "  -j  ctest parallel job count"
+      echo "  -h  show this help"
+      exit 0
+      ;;
+    \?)
+      echo "error: unknown option '-$OPTARG'."
+      exit 1
+      ;;
+    :)
+      echo "error: option '-$OPTARG' requires an argument."
+      exit 1
+      ;;
+  esac
+done
+
+
+export GLOG_v=$TEST_LOG_LEVEL

-ctest -j10 --output-on-failure
+cmake .. -DTEST_LIST_FILE=$TEST_LIST_FILE -DLOG_OUTPUT_DIR=$TEST_LOG_OUTPUT_DIR
+
+cmake --build .
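For context, a hypothetical way to drive the reworked script (the -i/-o/-v/-j flags come from the getopts block above; the file name and test choices here are illustrative). A test list file uses the same format as default.txt: one legacy-test module name per line, without the .py suffix.

import pathlib
import subprocess
import tempfile

# Write a one-off test list in the default.txt format.
list_file = pathlib.Path(tempfile.mkdtemp()) / "smoke.txt"
list_file.write_text("test_tril_triu_op\ntest_where_op\n")

# -i selects the list file (forwarded as -DTEST_LIST_FILE), -o collects one
# log per test (-DLOG_OUTPUT_DIR), -v sets GLOG_v, -j the ctest parallelism.
subprocess.run(
    ["bash", "run_test.sh", "-i", str(list_file), "-o", "./logs", "-v", "0", "-j", "4"],
    check=True,
)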
+ +ctest -j$TEST_PARALLEL_NUM --output-on-failure From 4f155310d76ac1b448e526cab281e4d1af9eb1ba Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Mon, 22 Sep 2025 18:02:57 +0800 Subject: [PATCH 6/9] [Metax] fix log_analysis.py bug --- .../metax_gpu/tests/scripts/log_analysis.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py index c0716f5b6f5..963d50751f7 100644 --- a/backends/metax_gpu/tests/scripts/log_analysis.py +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -153,7 +153,6 @@ def run(self): cur_res_type = TestResult.FAILURE cur_sub_type = "other" - pre_line = None finish_early = False try: @@ -172,19 +171,19 @@ def run(self): if finish_early: break - pre_line = line if finish_early: break - if "OK" in pre_line: - cur_res_type = TestResult.OK - cur_sub_type = None - for sub_type, sub_type_params in self.__classify_data[ - cur_res_type.value - ].items(): - for rule in sub_type_params["rule"]: - if rule in line: - cur_sub_type = sub_type + if len(line) >= 2 and line[:2] == "OK": + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + break op_name = filename.split(".") if cur_sub_type is None: From cb53dd324451900ef58c95a36e3e85e8d15360f0 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Tue, 23 Sep 2025 11:06:41 +0800 Subject: [PATCH 7/9] [Metax] update metax CI CMakeLists & scripts --- .github/workflows/metax_work.yaml | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 4 ++- backends/metax_gpu/tests/run_test.sh | 2 +- .../metax_gpu/tests/scripts/classify.json | 31 +++++++++++++++++-- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 51c0c62cef6..aff530d475c 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -51,4 +51,4 @@ jobs: - name: run test run: | cd backends/metax_gpu/tests - bash run_test.sh + bash run_test.sh -j 16 diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index ded54233f24..5b7be15e4f9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -47,6 +47,8 @@ if(NOT TEST_LIST_FILE) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS + # Metax unit test + ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py @@ -101,5 +103,5 @@ foreach(test_script ${PYTHON_TEST_SCRIPTS}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() - set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 600) endforeach() diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index b9e8ec5b5cc..7f2277fe4fb 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -33,7 +33,7 @@ mkdir -p build && cd build TEST_LOG_LEVEL=0 TEST_LIST_FILE="" TEST_LOG_OUTPUT_DIR="" -TEST_PARALLEL_NUM=10 +TEST_PARALLEL_NUM=1 while getopts "i:o:v:j:h" opt; do case "$opt" in diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json index b97255adc3d..ca92ad4a0a4 100644 --- 
a/backends/metax_gpu/tests/scripts/classify.json +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -7,13 +7,38 @@ "FAILED":{ "precision":{ - "rule":["Mismatched elements"] + "rule":["Mismatched elements", + "RuntimeError: Jacobian mismatch for output 0 in y with respect to input 0 in x on Place(metax_gpu:0),", + "AssertionError: np.float64("] }, "api":{ - "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", + "ValueError: The API paddle.device.cuda.get_device_properties", + "TypeError: paddle.index_add api", + "RuntimeError: (Unavailable) Paddle is not compiled with CUDA.", + "ValueError: invalid literal for int() with base", + "AttributeError: module 'paddle.base.libpaddle' has no attribute 'cudnn_version'", + "RuntimeError: Pinning memory is not supported for Place(metax_gpu:0)", + "PreconditionNotMetError: Context place error, excepted GPUPlace, but actually Place(metax_gpu:0).", + "AttributeError: module 'paddle.base.libpaddle.eager.ops.legacy' has no attribute 'fused_gemm_epilogue'", + "ValueError: The device should not be 'gpu', since PaddlePaddle is not compiled with CUDA"] }, "missing":{ - "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + "rule":["missing metax_gpu kernel", + "missing ONEDNN kernel", + "UnimplementedError: There are no kernels which are registered", + "symbol lookup error:", + "RuntimeError: (NotFound) The kernel"] + }, + "core_dumped":{ + "rule":["Segmentation fault"] + }, + "input_dim":{ + "rule":["ValueError: (InvalidArgument) The Input(", + "Test range of input is out of bound"] + }, + "array_dim":{ + "rule":["Arrays are not equal"] }, "file_not_found":{ "rule":["FileNotFoundError:"] From 16fc6c75db07077d77818cfd2b7ac2e2b497a422 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Tue, 23 Sep 2025 17:02:14 +0800 Subject: [PATCH 8/9] [Metax] fix MatmulKernel problem --- .../kernels/impl/matmul_kernel_impl.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h index bf228c81291..5221bd93ba9 100755 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h @@ -40,6 +40,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 #include "paddle/phi/kernels/autotune/auto_tune_base.h" #endif +#include "paddle/phi/kernels/full_kernel.h" // clang-format on namespace phi { @@ -1485,16 +1486,22 @@ void MatmulKernel(const Context& ctx, bool transpose_x, bool transpose_y, DenseTensor* out) { - PADDLE_ENFORCE_NE( + if (x.numel() == 0 || y.numel() == 0) { + // input shape [1, 1, 5, 0], [1, 1, 0, 5], result shape is [1, 1, 5, 5] + phi::Full( + ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); + return; + } + PADDLE_ENFORCE_GE( common::product(x.dims()), 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( + common::errors::InvalidArgument( + "The dims of Input(X) should be greater than or equal to 0.")); + PADDLE_ENFORCE_GE( common::product(y.dims()), 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. 
")); + common::errors::InvalidArgument( + "The dims of Input(Y) should be greater than or equal to 0.")); const std::vector x_dims = common::vectorize(x.dims()); const std::vector y_dims = common::vectorize(y.dims()); MatmulJudgeDtypeKernel( From 05d5b1cb6f9de6dc478b2b45c2f040d07ff34699 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Tue, 23 Sep 2025 18:41:34 +0800 Subject: [PATCH 9/9] [Metax] update metax CI program --- backends/metax_gpu/tests/CMakeLists.txt | 2 +- backends/metax_gpu/tests/default.txt | 258 ++++++++++++ ...r_equal.py => test_greater_equal_metax.py} | 0 ...ild_src_rank_and_local_expert_id_metax.py} | 0 ...cubate_expand_modality_expert_id_metax.py} | 0 ....py => test_incubate_moe_combine_metax.py} | 0 ...e_dispatch_partial_nosoftmaxtopk_metax.py} | 0 ..._moe_gate_dispatch_w_permute_bwd_metax.py} | 0 ...bate_moe_gate_dispatch_w_permute_metax.py} | 0 ...layer_norm.py => test_layer_norm_metax.py} | 0 ...l_op__metax.py => test_matmul_op_metax.py} | 0 ...mpling.py => test_top_p_sampling_metax.py} | 0 .../tests/unittest/test_matmul_op__metax.py | 395 ------------------ 13 files changed, 259 insertions(+), 396 deletions(-) rename backends/metax_gpu/tests/unit_test/{test_greater_equal.py => test_greater_equal_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_build_src_rank_and_local_expert_id.py => test_incubate_build_src_rank_and_local_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_expand_modality_expert_id.py => test_incubate_expand_modality_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_combine.py => test_incubate_moe_combine_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py => test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute_bwd.py => test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute.py => test_incubate_moe_gate_dispatch_w_permute_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_layer_norm.py => test_layer_norm_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_matmul_op__metax.py => test_matmul_op_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_top_p_sampling.py => test_top_p_sampling_metax.py} (100%) delete mode 100644 backends/metax_gpu/tests/unittest/test_matmul_op__metax.py diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 5b7be15e4f9..e8b11d347d9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -48,7 +48,7 @@ if(NOT TEST_LIST_FILE) REMOVE_ITEM PYTHON_TEST_SCRIPTS # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py + ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 8e2c3bcdd7e..9f073d7e92f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -65,3 +65,261 @@ test_scale_op test_softmax_with_cross_entropy_op test_full_op test_scatter_op +test_assign_pos_op +test_index_select_compatible +test_dequantize_abs_max_op +test_fill_any_op +test_fractional_max_pool3d_api +test_nll_loss +test_is_empty_op 
+test_norm_nn_grad +test_index_fill +test_floor +test_slice_scatter +test_nn_matmul_v2_grad +test_matmul_op_with_head +test_broadcast_shape +test_fill_constant_op +test_decayed_adagrad_op +test_count_nonzero_api +test_tensor_fill_ +test_minimum_op +test_sigmoid_focal_loss +test_dynamic_rnn_stop_gradient +test_ops_roi_align +test_split_op +test_sum_decorator +test_share_data_op +test_assert_op +test_masked_select_op +test_tensor_fill_diagonal_tensor_ +test_unfold_op +test_scatter_add_op +test_flatten_contiguous_range_op +test_empty_like_op +test_logsumexp +test_multiply +test_ceil_op +test_nearest_interp_v2_op +test_incubate_expand_modality_expert_id +test_bmm_op +test_prelu_op +test_batch_fc_op +test_masked_fill +test_overlap_add_op +test_update_loss_scaling_op +test_floor_divide_op +test_increment +test_complex_abs +test_gather_compatible +test_functional_conv2d +test_group_norm_op_v2 +test_conv2d_transpose_op_depthwise_conv +test_diagonal_op +test_maximum_op +test_erfinv_op +test_interp_recompute_scale_factor +test_embedding_scale_grad_by_freq +test_diagonal_scatter +test_higher_dim_scatter +test_infer_shape +test_flip +test_fused_bias_dropout_residual_layer_norm_op +test_greater_equal_op +test_add_op +test_cartesian_prod +test_uniform_random_inplace_op +test_feed_fetch_method +test_pow_op +test_conv3d_transpose_op +test_add_position_encoding_op +test_imperative_data_loader_base +test_rnn_cell_api +test_linspace +test_adaptive_log_softmax_with_loss +test_cross_entropy2_op +test_complex_reshape +test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk +test_gaussian_nll_loss +test_log_normal +test_unstack_op +test_expand_as_v2_op +test_dequantize_log_op +test_complex_sum_layer +test_slice_var +test_scale_op +test_hinge_embedding_loss +test_set_value_op +test_merged_adam_op +test_index_sample_op +test_cuda_empty_cache +test_add_n_op +test_randint_like +test_unique_consecutive_op +test_fill_diagonal_tensor_op +test_log_loss_op +test_linalg_cholesky_inverse +test_numel_op +test_tril_triu_op +test_adaptive_max_pool2d +test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad +test_complex_cast +test_poisson_nll_loss +test_empty_op +test_functional_conv1d_transpose +test_clip_by_norm_op +test_box_clip_op +test_clip_op +test_grad_clip_minimize +test_less_than_op +test_adamw_op +test_data_feeder +test_top_p_sampling +test_subtract_op +test_batch_norm_op_v2 +test_cosine_embedding_loss +test_imperative_data_parallel +test_sigmoid +test_adaptive_max_pool3d +test_roll_op +test_index_put_op +test_assign_op +test_amp_check_finite_and_scale_op +test_strided_slice_op +test_label_smooth_functional +test_c_softmax_with_cross_entropy_op +test_sync_batch_norm_op_convert +test_tensor_fill_diagonal_tensor +test_bfloat16_embedding +test_gelu_op +test_full_ +test_concat_op +test_imperative_data_loader_process +test_tensor_fill_diagonal_ +test_clip_grad_norm_ +test_eager_deletion_padding_rnn +test_pool2d_api +test_clip_grad_value_ +test_isfinite_v2_op +test_nn_sigmoid_op +test_adaptive_avg_pool2d +test_size +test_sigmoid_cross_entropy_with_logits_op +test_scatter_reduce_op +test_rsqrt +test_conv2d_transpose_layer +test_scatter_compatible +test_scatter_nd_op +test_add_op_fluid +test_unique +test_compat_split_static +test_stack_op +test_tile_op +test_adam_optimizer_fp32_fp64 +test_batch_norm_op +test_gather_nd_op +test_pow +test_executor_check_fetch_list +test_inplace_softmax_with_cross_entropy +test_cos +test_imperative_parallel_coalesce_split +test_grid_sample_function +test_rnn_decode_api +test_triu_indices_op 
+test_binary_cross_entropy_with_logits_op +test_mean_op_v1 +test_round_op +test_assign_pos_op_dygraph +test_nn_functional_embedding_static +test_norm_op +test_unbind_op +test_bilinear_interp_v2_op +test_tensor_data_ptr +test_norm_all +test_conv1d_transpose_layer +test_arange +test_compat_unfold +test_fetch_var +test_index_select_op +test_sign_op +test_functional_conv3d_transpose +test_uniform_random_bf16_op +test_gather_tree_op +test_histogram_bin_edges_op +test_fractional_max_pool2d_api +test_fill_any_like_op +test_alpha_dropout +test_conv3d_layer +test_compat_pad +test_box_coder_op +test_full_op +test_repeat_interleave_op +test_reshape_op +test_embedding_renorm +test_log_softmax +test_pad3d_op +test_diag_v2 +test_complex_transpose +test_prior_box_op +test_square_error_cost +test_fused_rotary_position_embedding +test_gru_rnn_op +test_restrict_nonzero +test_dygraph_weight_norm +test_conv_transpose_nn_grad +test_incubate_build_src_rank_and_local_expert_id +test_elementwise_nn_grad +test_fused_bias_dropout_residual_layer_norm_op_api +test_simple_rnn_op +test_data_generator +test_compat_split +test_scatter_add_inplace_op +test_c_softmax_with_multi_label_cross_entropy_op +test_conv3d_transpose_layer +test_less_equal_op +test_gumbel_softmax_op +test_assign_value_op +test_cast_op +test_fused_bias_act_op +test_conv3d_transpose_part2_op +test_log +test_data +test_incubate_moe_combine +test_masked_scatter +test_silu_op +test_select_scatter_op +test_adagrad_op_v2 +test_functional_conv3d +test_bce_with_logits_loss +test_argsort_op +test_layer_norm_op_v2 +test_adaptive_max_pool1d +test_shard_index_op +test_cuda_max_memory_allocated +test_roi_align_op +test_sin +test_take +test_take_along_dim +test_complex_matmul +test_reduce_as_op +test_log_normal_inplace +test_repeat +test_fetch_lod_tensor_array +test_partial_concat_op +test_accuracy_op +test_l1_norm_op +test_bce_loss +test_fused_conv2d_add_act_op +test_tril_indices_op +test_cross_entropy_op +test_blha_get_max_len_op +test_softmax_mask_fuse_op +test_diag_embed +test_one_hot_v2_op +test_selu_op +test_huber_loss_op +test_einsum_op +test_dygraph_spectral_norm +test_block_diag +test_index_elementwise +test_matmul_out diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_greater_equal.py rename to backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py rename to 
backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_layer_norm.py rename to backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py rename to backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_top_p_sampling.py rename to backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py diff --git a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py b/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py deleted file mode 100644 index 7545e16d14d..00000000000 --- a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py +++ /dev/null @@ -1,395 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -from tests.op_test import OpTest -import paddle - -paddle.enable_static() -SEED = 2022 - - -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. 
- if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size,)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size,)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if abs(scale - 1.0) > 1e-09: - Out = Out * scale - return Out - - -class TestBmmOp(OpTest): - """ - case 0 - """ - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (10, 2, 5) - self.y_shape = (10, 5, 8) - - def init_kernel_type(self): - self.dtype = "float32" - - def setUp(self): - self.set_metax_gpu() - self.init_kernel_type() - self.config() - self.op_type = "bmm" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y - result = reference_matmul(x, y) - result = result.astype(self.dtype) - self.inputs = { - "X": x, - "Y": y, - } - self.outputs = {"Out": result} - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp1(TestBmmOp): - """ - case 1 - """ - - def config(self): - self.x_shape = (40, 10, 10) - self.y_shape = (40, 10, 10) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp2(TestBmmOp): - """ - case 2 - """ - - def config(self): - self.x_shape = (4, 10, 80) - self.y_shape = (4, 80, 1) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, - ["X", "Y"], - "Out", - max_relative_error=1e-2, - ) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - -class TestMatMulOp(OpTest): - """ - basic case - """ - - def setUp(self): - self.set_metax_gpu() - self.op_type = "matmul_v2" - self.init_dtype() - self.init_alpha() - self.config() - - X = np.random.random(self.x_shape).astype(self.dtype) - Y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - X = -0.1 + 0.2 * X - Y = -0.1 + 0.2 * Y - Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) - Out = Out.astype(self.dtype) - self.inputs = {"X": X, "Y": Y} - self.attrs = { - "trans_x": self.transpose_X, - "trans_y": self.transpose_Y, - "alpha": self.alpha, - } - self.outputs = {"Out": Out} - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (100,) - self.y_shape = (100,) - self.transpose_X = False - self.transpose_Y = False - - def init_alpha(self): - self.alpha = 1.0 - - def init_dtype(self): - self.dtype = "float32" - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-7) - - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestMatMulOp1(TestMatMulOp): - """ - case x_ndim == 1, y_ndim != 1 - """ - - def config(self): - self.x_shape = (100,) - self.y_shape = (1, 3, 2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp2(TestMatMulOp): - """ - case 
x_ndim != 1, y_ndim == 1 - """ - - def config(self): - self.x_shape = (1, 2, 100, 1) - self.y_shape = (100,) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp3(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp4(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp5(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (100, 2) - self.y_shape = (100, 2) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp6(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 2, 25) - self.y_shape = (25, 4) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp7(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 4, 25) - self.y_shape = (4, 25) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp8(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 25, 4) - self.y_shape = (25, 4) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp9(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 10, 5) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp10(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 10, 5) - self.y_shape = (2, 10, 5) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp11(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 5, 10) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp12(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = 100 - self.y_shape = (1, 2, 2, 100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp13(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = (2, 1, 100) - self.y_shape = 100 - self.transpose_X = False - self.transpose_Y = False - - -# TODO(metax_gpu): alpha will be supported in next version -# --------------------test matmul alpha-------------------- -# def create_test_alpha_class(parent): -# class TestMatMulOpAlphaCase(parent): -# def init_alpha(self): -# self.alpha = 0.125 - -# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") -# TestMatMulOpAlphaCase.__name__ = cls_name -# globals()[cls_name] = TestMatMulOpAlphaCase - -# create_test_alpha_class(TestMatMulOp) -# create_test_alpha_class(TestMatMulOp1) -# create_test_alpha_class(TestMatMulOp2) -# create_test_alpha_class(TestMatMulOp3) -# create_test_alpha_class(TestMatMulOp4) -# create_test_alpha_class(TestMatMulOp5) -# create_test_alpha_class(TestMatMulOp6) -# create_test_alpha_class(TestMatMulOp9) -# create_test_alpha_class(TestMatMulOp10) -# create_test_alpha_class(TestMatMulOp11) -# create_test_alpha_class(TestMatMulOp12) -# create_test_alpha_class(TestMatMulOp13) - - -# --------------------test matmul 
fp16-------------------- -def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): - class TestMatMulOpFp16Case(parent): - def init_kernel_type(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output_with_place(self.place, atol=atol) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error - ) - - cls_name = "{0}_{1}".format(parent.__name__, "Fp16") - TestMatMulOpFp16Case.__name__ = cls_name - globals()[cls_name] = TestMatMulOpFp16Case - - -create_test_fp16_class(TestMatMulOp) -create_test_fp16_class(TestMatMulOp1) -create_test_fp16_class(TestMatMulOp2) -create_test_fp16_class(TestMatMulOp3) -create_test_fp16_class(TestMatMulOp4) -create_test_fp16_class(TestMatMulOp5) -create_test_fp16_class(TestMatMulOp6) -create_test_fp16_class(TestMatMulOp9) -create_test_fp16_class(TestMatMulOp10) -create_test_fp16_class(TestMatMulOp11) -create_test_fp16_class(TestMatMulOp12) -create_test_fp16_class(TestMatMulOp13) - -if __name__ == "__main__": - unittest.main()
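Closing note on the PATCH 8 matmul change: when either operand has numel() == 0, the kernel now fills the output with zeros instead of rejecting the input. NumPy already implements these semantics, so a short sketch of the expected behavior (the shapes are the ones named in the comment in that hunk):

import numpy as np

# An empty contraction dimension (K == 0) yields a zero-filled result of the
# inferred output shape, e.g. [1, 1, 5, 0] x [1, 1, 0, 5] -> [1, 1, 5, 5].
x = np.zeros((1, 1, 5, 0), dtype=np.float32)
y = np.zeros((1, 1, 0, 5), dtype=np.float32)
out = np.matmul(x, y)
assert out.shape == (1, 1, 5, 5)
assert not out.any()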