From 762cf919abd7344c37773414c998186649142cb8 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Tue, 9 Sep 2025 16:04:01 +0800
Subject: [PATCH 1/9] [Metax] fix dgc & mklml compile product path problem

---
 backends/metax_gpu/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index 787aae13e40..b6f9abccf14 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -26,6 +26,10 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")
 
 set(WITH_MKLML ON)
+set(THIRD_PARTY_PATH
+    "${PADDLE_SOURCE_DIR}/build/third_party"
+    CACHE PATH "Third party libraries directory.")
+
 include(paddle)
 set(THIRD_PARTY_PATH
     "${PADDLE_SOURCE_DIR}/build/third_party"

From 9249b1ecd662bf6d266ef404834a245eec81b115 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Thu, 11 Sep 2025 17:28:51 +0800
Subject: [PATCH 2/9] [Metax] update metax_gpu CMakeLists.txt

---
 backends/metax_gpu/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index b6f9abccf14..787aae13e40 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -26,10 +26,6 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")
 
 set(WITH_MKLML ON)
-set(THIRD_PARTY_PATH
-    "${PADDLE_SOURCE_DIR}/build/third_party"
-    CACHE PATH "Third party libraries directory.")
-
 include(paddle)
 set(THIRD_PARTY_PATH
     "${PADDLE_SOURCE_DIR}/build/third_party"

From 6634ef1611a035f00948245fe3aaaf5d79f97366 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Tue, 16 Sep 2025 18:12:17 +0800
Subject: [PATCH 3/9] [Metax] organize documents

---
 .../calc_reduced_attn_kernel_register.cu      |   2 +-
 backends/metax_gpu/kernels/funcs/softmax.cu   |   2 +-
 .../kernels/funcs/values_vectors_functor.h    |   2 +-
 .../metax_gpu/kernels/gpudnn/conv_cudnn_v7.h  |   2 +-
 .../conv_transpose_grad_kernel_register.cu    |   2 +-
 .../kernels/gpudnn/pool_kernel_register.cu    |   2 +-
 .../metax_gpu/kernels/gpudnn/softmax_gpudnn.h |   2 +-
 .../kernels/impl/dirichlet_kernel_impl.h      |   2 +-
 .../addmm_grad_kernel_register.cu             |   0
 .../addmm_kernel_register.cu                  |   0
 .../batch_fc_grad_kernel_register.cu          |   0
 .../batch_norm_grad_kernel_register.cu        |   2 +-
 .../batch_norm_kernel_register.cu             |   0
 .../bilinear_grad_kernel_register.cu          |   0
 .../bilinear_kernel_register.cu               |   0
 .../metax_kernel/blha_get_max_len_register.cu |   2 +-
 .../bmm_grad_kernel_register.cu               |   0
 .../bmm_kernel_register.cu                    |   0
 ...abel_cross_entropy_grad_kernel_register.cu |   0
 .../cholesky_grad_kernel_register.cu          |   0
 .../metax_kernel/cholesky_kernel_register.cu  |   2 +-
 .../conv_kernel_register.cu                   |   0
 .../conv_transpose_kernel_register.cu         |   0
 .../crop_kernel_register.cu                   |   0
 .../cross_entropy_kernel_register.cu          |   2 +-
 .../depthwise_conv_grad_kernel.cu             |   0
 .../depthwise_conv_kernel.cu                  |   0
 .../kernels/{ => metax_kernel}/elementwise.h  |   0
 .../{ => metax_kernel}/flags_declare.cu       |   0
 .../flash_attn_grad_kernel.cu                 |   0
 .../{ => metax_kernel}/flash_attn_kernel.cu   |   0
 .../{ => metax_kernel}/flash_attn_kernel.h    |   0
 .../{ => metax_kernel}/flash_attn_utils.h     |   0
 .../kernels/{ => metax_kernel}/flashattn.cc   |   0
 .../kernels/{ => metax_kernel}/flashattn.h    |   0
 .../flatten2_grad_kernel_register.cu          |   0
 .../flatten2_kernel_register.cu               |   0
 .../fused_conv2d_add_act_kernel_register.cu   |   3 +-
 .../fused_rope_grad_kernel_register.cu        |   0
 .../fused_rope_kernel_register.cu             |   0
 .../instance_norm_grad_kerne_registerl.cu     |   2 +-
 .../instance_norm_kernel_register.cu          |   2 +-
 .../layer_norm_grad_kernel_register.cu        |   0
 .../layer_norm_kernel_register.cu             |   0
 .../lstm_kernel_register.cu                   |   0
 .../metax_kernel/lu_kernel_register.cu        |   2 +-
 .../lu_solve_grad_kernel_register.cu          |   0
 .../metax_kernel/matrix_rank_tol_kernel.cu    |   2 +-
 .../{ => metax_kernel}/metax_context.cc       |  24 +-
 .../{ => metax_kernel}/metax_context.h        |   6 +-
 .../multi_dot_grad_kernel_register.cu         |   0
 .../multi_dot_kernel_register.cu              |   0
 .../mv_grad_kernel_register.cu                |   0
 .../mv_kernel_register.cu                     |   0
 .../metax_kernel/qr_kernel_register.cu        |   2 +-
 .../rank_attention_grad_kernel_register.cu    |   0
 .../rank_attention_kernel_register.cu         |   0
 .../metax_kernel/rnn_grad_kernel.cu.cc        |   2 +-
 .../kernels/metax_kernel/rnn_kernel.cu.cc     |   2 +-
 .../slogdeterminant_kernel_register.cu        |   0
 .../softmax_kernel_grad_register.cu           |   0
 .../softmax_kernel_register.cu                |   0
 .../solve_grad_kernel_register.cu             |   0
 .../standard_gamma_kernel_register.cu         |   0
 .../stft_kernel_register.cu                   |   0
 .../svd_kernel_register.cu                    |   0
 .../top_k_grad_kernel_register.cu             |   0
 .../triangular_solve_grad_kernel_register.cu  |   0
 .../triangular_solve_kernel_register.cu       |   0
 .../warprnnt_kernel_register.cu               |   0
 .../weight_only_linear_kernel.cu              |   0
 .../weight_quantize_kernel_register.cu        |   0
 backends/metax_gpu/patch/paddle.patch         | 204 +++++++++---------
 backends/metax_gpu/tests/CMakeLists.txt       |  54 ++---
 74 files changed, 166 insertions(+), 163 deletions(-)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_fc_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_norm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/cholesky_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_transpose_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/crop_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_grad_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/elementwise.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flags_declare.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_grad_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_utils.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.cc (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.h (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_conv2d_add_act_kernel_register.cu (99%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lstm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lu_solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.cc (90%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.h (96%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/slogdeterminant_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_grad_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/standard_gamma_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/stft_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/svd_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/top_k_grad_kernel_register.cu (100%)
 mode change 100755 => 100644
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/warprnnt_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_only_linear_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_quantize_kernel_register.cu (100%)

diff --git a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
index 11def2c9ee4..2aa8424f0b1 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/flash_attn_utils.h"
+#include "kernels/metax_kernel/flash_attn_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/calc_reduced_attn_kernel.h"

diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu
index d738a53f43a..44bfd02a308 100644
--- a/backends/metax_gpu/kernels/funcs/softmax.cu
+++ b/backends/metax_gpu/kernels/funcs/softmax.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include
 
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

diff --git a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
index ec429950872..8c5996e680b 100644
--- a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
+++ b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
@@ -24,7 +24,7 @@
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/common/errors.h"
 #endif
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
index da61a1e5b41..a0f89047045 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
+++ b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "glog/logging.h"
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
 #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
index 0067818d165..b7eebfcee2e 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include
 
 #include "kernels/gpudnn/conv_cudnn_v7.h"
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/common/ddim.h"
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/dynload/cudnn.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
index c115f5ad930..1c2bfeedf34 100644
--- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gpudnn/pool_gpudnn.h"
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
index 168752700e9..5844886ad1b 100644
--- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
+++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
@@ -25,7 +25,7 @@
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"

diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
index 70af87513e5..c2e2e341bf5 100644
--- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
+++ b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
@@ -17,7 +17,7 @@
 #include
 #include
 
-#include "kernels/elementwise.h"
+#include "kernels/metax_kernel/elementwise.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
index 062646bbf9d..52fe5a1d566 100644
--- a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/flags.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu index bc9eb23c0e8..42810569fde 100644 --- a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" #include "kernels/metax_kernel/block_attn.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu 
b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu index e8fae2d9da5..8a39ae3f0a8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu index e94862ec7b0..043a64dc149 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/kernels/cross_entropy_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu diff --git a/backends/metax_gpu/kernels/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h similarity index 100% rename from backends/metax_gpu/kernels/elementwise.h rename to backends/metax_gpu/kernels/metax_kernel/elementwise.h diff --git a/backends/metax_gpu/kernels/flags_declare.cu b/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu similarity index 100% rename from backends/metax_gpu/kernels/flags_declare.cu rename to backends/metax_gpu/kernels/metax_kernel/flags_declare.cu diff --git a/backends/metax_gpu/kernels/flash_attn_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h diff --git a/backends/metax_gpu/kernels/flash_attn_utils.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_utils.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h diff --git a/backends/metax_gpu/kernels/flashattn.cc b/backends/metax_gpu/kernels/metax_kernel/flashattn.cc similarity index 100% rename from backends/metax_gpu/kernels/flashattn.cc rename to backends/metax_gpu/kernels/metax_kernel/flashattn.cc diff --git a/backends/metax_gpu/kernels/flashattn.h b/backends/metax_gpu/kernels/metax_kernel/flashattn.h similarity index 100% rename from backends/metax_gpu/kernels/flashattn.h rename to backends/metax_gpu/kernels/metax_kernel/flashattn.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu similarity index 100% rename from 
rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu
similarity index 99%
rename from backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu
index 48809ceefa4..c0d15b7f1b4 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu
@@ -308,7 +308,8 @@ class CudnnConvDescManager {
       int groups,
       cudnnDataType_t dtype) {
     auto* desc = new phi::backends::gpu::ConvolutionDescriptor();
-    desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
+    desc->set(
+        dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups);
     return desc;
   }

diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu
index d7540d949a9..bdf341f5a35 100644
--- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "glog/logging.h"
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/common/layout.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"

diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
index db975d74665..e0c0ae9c1d6 100644
--- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #include "glog/logging.h"
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/common/layout.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"

diff --git a/backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/layer_norm_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu
index 5a2d85418a1..72e4c5b2b79 100644
--- a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu
@@ -18,7 +18,7 @@
 #include "paddle/phi/backends/dynload/cusolver.h"
 #endif
 
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/enforce.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu
index bda5dc62f1a..d8c3355e6e4 100644
--- a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu
@@ -18,7 +18,7 @@
 #include
 #include
 
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/dynload/cusolver.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/common/type_traits.h"

diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc
similarity index 90%
rename from backends/metax_gpu/kernels/metax_context.cc
rename to backends/metax_gpu/kernels/metax_kernel/metax_context.cc
index f0c92f00565..62aaa5fb2de 100644
--- a/backends/metax_gpu/kernels/metax_context.cc
+++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc
@@ -12,27 +12,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" namespace phi { const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; }(); const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; }(); bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h similarity index 96% rename from backends/metax_gpu/kernels/metax_context.h rename to backends/metax_gpu/kernels/metax_kernel/metax_context.h index 683a6df7017..a6610c1dab2 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ -#define BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ +#define BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ #include #include #include @@ -161,4 +161,4 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, return DnnWorkspaceHandle(alloactor, stream); } } // namespace phi -#endif // BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#endif // BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 745069e2eda..c3041254444 100644 --- 

diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu
index 745069e2eda..c3041254444 100644
--- a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu
@@ -22,7 +22,7 @@
 #include
 #include
 
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/enforce.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc
index 499832049e4..101b51aa350 100644
--- a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc
+++ b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/phi/kernels/rnn_grad_kernel.h"
 
-#include "kernels/metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"

diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc
index f1cf9e09dc7..2598ce093e6 100644
--- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc
+++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc
@@ -15,7 +15,7 @@
 #include "paddle/phi/kernels/rnn_kernel.h"
 
 #include "glog/logging.h"
-#include "kernels/metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/generator.h"
 #include "paddle/phi/core/kernel_registry.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu
old mode 100755
new mode 100644
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu
rename to backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu

diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu

diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch
index 0283a443adb..e56826c4f3e 100755
--- a/backends/metax_gpu/patch/paddle.patch
+++ b/backends/metax_gpu/patch/paddle.patch
@@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644
-     set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header})
+     # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header})
    endif()
- 
+ 
   set(EIGEN_INCLUDE_DIR ${SOURCE_DIR})
 diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc
 index bff0f2bf70..9376b5781f 100644
 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc
 +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc
 @@ -16,7 +16,7 @@
- 
+ 
  #include
- 
+ 
 -#include "paddle/fluid/platform/enforce.h"
 +// #include "paddle/fluid/platform/enforce.h"
  #include "paddle/phi/core/os_info.h"
@@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644
+  __macro(cudnnDestroyActivationDescriptor); \
+  __macro(cudnnSetRNNDescriptor_v6);
 CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
- 
+ 
 #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
@@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
@@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644
+  __macro(cudnnRNNForwardInferenceEx);
 CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
- 
+ 
@@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
- 
+ 
-#if CUDNN_VERSION < 90000
-#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \
-  __macro(cudnnGetRNNParamsSize);                      \
-#endif
 } // namespace dynload
 } // namespace phi
 
diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h
index 1547909d92..66b2779392 100644
--- a/paddle/phi/backends/dynload/cufft.h
+++ b/paddle/phi/backends/dynload/cufft.h
@@ -1,3 +1,4 @@
-+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. 
++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
- 
+ 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
       cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \

diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h
index 59e92955c9..d2f8c2da15 100644
--- a/paddle/phi/backends/dynload/cupti.h
+++ b/paddle/phi/backends/dynload/cupti.h
@@ -24,8 +24,8 @@ limitations under the License. */
 #include "paddle/phi/backends/dynload/dynamic_loader.h"
 #include "paddle/phi/common/port.h"
 
-namespace phi {
-namespace dynload {
+// namespace phi {
+// namespace dynload {
 
 extern std::once_flag cupti_dso_flag;
 extern void *cupti_dso_handle;
@@ -71,7 +71,7 @@ extern void *cupti_dso_handle;
 CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
 
 #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
-} // namespace dynload
-} // namespace phi
+// } // namespace dynload
+// } // namespace phi
 
-#endif // PADDLE_WITH_CUPTI
+#endif // PADDLE_WITH_CUPTI
\ No newline at end of file

@@ -230,28 +230,28 @@ index 4ff2e528a9..81421c8ca1 100644
--- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h
+++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
@@ -1,3 +1,4 @@
++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
 /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
- 
+ 
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,7 +26,7 @@ namespace phi {
 namespace backends {
 namespace gpu {
- 
-#define FULL_WARP_MASK 0xFFFFFFFF
+#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL
 #define CREATE_SHFL_MASK(mask, predicate) \
   mask = __ballot_sync(FULL_WARP_MASK, (predicate))
- 
+ 
@@ -45,12 +46,12 @@ namespace gpu {
- 
+ 
 template
 __forceinline__ __device__ T
-CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) {
+CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) {
   return __shfl_down_sync(mask, val, static_cast(delta), width);
 }
- 
+ 
 template
-__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask,
+__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask,
@@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644
   int width = warpSize) {
   return __shfl_xor_sync(mask, val, width);
@@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask,
- 
+ 
 template <>
 __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync(
-    unsigned mask, phi::dtype::float16 val, int delta, int width) {
+    unsigned long long mask, phi::dtype::float16 val, int delta, int width) {
   return phi::dtype::float16(__shfl_down_sync(
       mask, val.to_half(), static_cast(delta), width));
 }
- 
+ 
 template <>
 __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync(
-    unsigned mask, phi::dtype::bfloat16 val, int delta, int width) {
+    unsigned long long mask, phi::dtype::bfloat16 val, int delta, int width) {
   return phi::dtype::bfloat16(__shfl_down_sync(
       mask, val.to_nv_bfloat16(), static_cast(delta), width));
@@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
-    unsigned mask, phi::dtype::complex val, int delta, int width) {
+    unsigned long long mask, phi::dtype::complex val, int delta, int width) {
       mask, static_cast(val.real), static_cast(delta), width));
   float imag = static_cast(__shfl_down_sync(
@@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
-    unsigned mask, phi::dtype::complex val, int delta, int width) {
+    unsigned long long mask, phi::dtype::complex val, int delta, int width) {
       static_cast(__shfl_down_sync(mask,
                                    static_cast(val.real),
@@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync(
-    unsigned mask, phi::dtype::float16 val, int width) {
+    unsigned long long mask, phi::dtype::float16 val, int width) {
   return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width));
 }
 
 template <>
 __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync(
-    unsigned mask, phi::dtype::bfloat16 val, int width) {
+    unsigned long long mask, phi::dtype::bfloat16 val, int width) {
   return phi::dtype::bfloat16(
       __shfl_xor_sync(mask, val.to_nv_bfloat16(), width));
@@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
-    unsigned mask, phi::dtype::complex val, int width) {
+    unsigned long long mask, phi::dtype::complex val, int width) {
       __shfl_xor_sync(mask, static_cast(val.real), width));
   float imag = static_cast(
@@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
 
 template <>
 __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
-    unsigned mask, phi::dtype::complex val, int width) {
+    unsigned long long mask, phi::dtype::complex val, int width) {
       __shfl_xor_sync(mask, static_cast(val.real), width));
   double imag = static_cast(
@@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
 
 template
 __forceinline__ __device__ T
-CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) {
+CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) {
   return __shfl_sync(mask, val, src_line, width);
 }
 
@@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) {
   // but most card's warp size is 32.
   const int warpSize = 32;
@@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644
-  unsigned mask = 0u;
+  unsigned long long mask = 0ull;
   CREATE_SHFL_MASK(mask, tid < len);
 
   for (int offset = warpSize / 2; offset > 0; offset /= 2)
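
NOTE (review): the cuda_device_function.h hunks above widen FULL_WARP_MASK and every shuffle-mask parameter from 32-bit unsigned to unsigned long long, consistent with a 64-lane wavefront; a 32-bit mask would silently drop lanes 32-63. A host-side C++ sketch of that truncation hazard (hypothetical names; GCC/Clang builtins assumed):

    // warp_mask_sketch.cc -- why mask width matters on 64-lane warps.
    // Build: g++ warp_mask_sketch.cc
    #include <cstdint>
    #include <cstdio>

    // "First n lanes" mask, analogous to what CREATE_SHFL_MASK produces.
    static uint64_t LaneMask(int n) {
      return n >= 64 ? ~0ull : (1ull << n) - 1;
    }

    int main() {
      uint64_t full64 = 0xFFFFFFFFFFFFFFFFull;          // FULL_WARP_MASK after the patch
      uint32_t full32 = static_cast<uint32_t>(full64);  // pre-patch 32-bit width
      std::printf("lanes covered, 64-bit mask: %d\n", __builtin_popcountll(full64));
      std::printf("lanes covered, 32-bit mask: %d\n", __builtin_popcount(full32));
      std::printf("mask for 40 active lanes: %#llx\n",
                  static_cast<unsigned long long>(LaneMask(40)));
      return 0;
    }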
-DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle); - + #undef DECLARE_TYPE_FOR_GPU - + diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h index 2d02eb370b..8a7233e34e 100644 --- a/paddle/phi/core/platform/device_context.h @@ -430,58 +430,58 @@ index d69eb67d6f..1d8b6e9375 100644 --- a/paddle/phi/kernels/cpu/index_select_impl.h +++ b/paddle/phi/kernels/cpu/index_select_impl.h @@ -18,7 +18,7 @@ - + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -16,12 +16,12 @@ limitations under the License. */ - + #include "paddle/phi/backends/all_context.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/fc_functor.h" - + #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h" #include "paddle/phi/kernels/funcs/quant_dequant.h" #include "paddle/phi/kernels/matmul_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu index 88663ec880..98b93072a3 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cu +++ b/paddle/phi/kernels/funcs/gru_compute.cu @@ -12,7 +12,7 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/gru_compute.h" - + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" - + diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h +++ b/paddle/phi/kernels/funcs/math/context_project.h @@ -18,7 +18,7 @@ #include - + #include "paddle/phi/core/tensor_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/im2col.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 @@ -489,14 +489,14 @@ index e101224970..a52eb6096f 100644 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -15,11 +15,13 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + #include "paddle/phi/common/memory_utils.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" - + namespace phi { namespace funcs { - + + + template @@ -514,19 +514,19 @@ index 558d363b39..05da04b517 100644 +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/scatter.cu.h" - + diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu index 8b0baf5f5f..260482f124 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub; - + #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h" - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" - + namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index e30d440ff3..3c74792690 100644 @@ -535,7 +535,7 @@ index e30d440ff3..3c74792690 100644 @@ -30,11 +30,11 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" #include "paddle/phi/kernels/primitive/functor_primitives.h" - + -#define FINAL_MASK 0xffffffff +#define FINAL_MASK 0xffffffffffffffffull #ifdef PADDLE_WITH_HIP @@ -545,7 +545,7 @@ index e30d440ff3..3c74792690 100644 +#define WARP_SIZE 64 #endif #define MAX_NUM_THREADS 1024 - + @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[], for (int k = beam_size - 2; k >= 0; k--) { if (largest) { @@ -606,7 +606,7 @@ index e30d440ff3..3c74792690 100644 + topk[0 + offset].v = p.v; + topk[0 + offset].id = p.id; } - + template @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[], template @@ -662,7 +662,7 @@ index e30d440ff3..3c74792690 100644 + // topk + MaxLength - *beam, src, tid, dim, *max, length, largest); } } - + @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[], shared_max[wid] = input_now; } @@ -697,7 +697,7 @@ index e30d440ff3..3c74792690 100644 - if (--(*k) == 0) break; + // if (--(*k) == 0) break; + unsigned long long mask = 0ull; - + - unsigned mask = 0u; + // unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); @@ -721,14 +721,14 @@ index e30d440ff3..3c74792690 100644 + return ret; } - + static __device__ __forceinline__ unsigned int SetBitfield( unsigned int val, unsigned int to_insert, int pos, int len) { unsigned int ret; - asm("bfi.b32 %0, %1, %2, %3, %4;" - : "=r"(ret) - : "r"(to_insert), "r"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (32 - pos - len)) >> (32 - len); return ret; } @@ -738,12 +738,12 @@ index e30d440ff3..3c74792690 100644 int len) { uint64_t ret; - asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); -+ ++ + + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; } - + @@ -507,9 +556,9 @@ struct Bitfield { int pos, int len) { @@ -751,7 +751,7 @@ index e30d440ff3..3c74792690 100644 - asm("bfi.b64 %0, %1, %2, %3, %4;" - : "=l"(ret) - : "l"(to_insert), "l"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); + return ret; @@ -763,7 +763,7 @@ index e30d440ff3..3c74792690 100644 int lane_id; - asm("mov.s32 %0, %%laneid;" : "=r"(lane_id)); - return lane_id; -+ ++ +// // >>>> PTX2CPP Success <<<< +// { +// 
(lane_id)=(threadIdx.x&(warpSize-1)); @@ -771,7 +771,7 @@ index e30d440ff3..3c74792690 100644 + return ::__lane_id(); + // return lane_id; } - + __device__ __forceinline__ unsigned GetLaneMaskLe() { unsigned mask; - asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask)); @@ -780,17 +780,17 @@ index e30d440ff3..3c74792690 100644 + return ((uint64_t(1) << ::__lane_id()) << 1) - 1; + // return mask; } - + template @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input, - + // 1. Find the k-th value T kth_value = static_cast(0); - RadixSearch::RadixType, IndexType, false>( + // RadixSearch::RadixType, IndexType, false>( + RadixSearch::RadixType, IndexType, false>( cur_input, k, num_cols, shared_mem, &kth_value); - + __shared__ int64_t block_min_idx; @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx, } @@ -803,12 +803,12 @@ index 32db61532f..0220316bc3 100644 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h @@ -15,7 +15,7 @@ #pragma once - + #if defined(PADDLE_WITH_CUDA) -#include "paddle/phi/backends/dynload/cublasLt.h" +// #include "paddle/phi/backends/dynload/cublasLt.h" #endif - + #include "glog/logging.h" diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h index 9d4bb18d55..ea42cc10a9 100644 @@ -830,12 +830,12 @@ index b8cfdbf3ce..fa14b94a77 100644 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -843,12 +843,12 @@ index e838778952..83e805e75a 100644 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu @@ -14,7 +14,7 @@ - + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h" +#include "kernels/metax_kernel/mmha_util.cu.h" - + namespace phi { namespace fusion { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -863,7 +863,7 @@ index f0cca0f701..02ea957240 100644 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h" +#include "kernels/gpudnn/conv_gpudnn.h" +#include "kernels/impl/conv_cudnn_impl.h" - + namespace phi { // To determine use cudnn or not. 
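Note on the top_k_function_cuda.h hunks earlier in this patch: they widen the warp primitives from 32 to 64 lanes (FINAL_MASK and WARP_SIZE) and replace the inline PTX (the bfe/bfi bitfield instructions, %%laneid, %%lanemask_le) with plain integer arithmetic. Below is a minimal sketch of that arithmetic in Python, for illustration only; the helper names and the explicit 64-bit mask are assumptions, not part of the patch.

WIDTH = 64  # wave width assumed by the patched kernels

def bitfield_extract(val: int, pos: int, length: int, width: int = WIDTH) -> int:
    # Same trick as `(val << (width - pos - len)) >> (width - len)` in the hunk:
    # shift the field up to the top of the register, then back down, so only
    # bits [pos, pos + length) survive. Python needs an explicit mask because
    # its integers never overflow.
    mask = (1 << width) - 1
    return ((val << (width - pos - length)) & mask) >> (width - length)

def lane_mask_le(lane_id: int) -> int:
    # Same as `((uint64_t(1) << __lane_id()) << 1) - 1`: set every bit at a
    # position less than or equal to lane_id.
    return ((1 << lane_id) << 1) - 1

assert bitfield_extract(0b101100, pos=2, length=3) == 0b011
assert lane_mask_le(3) == 0b1111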
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +890,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +901,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +914,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -948,7 +948,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +961,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +993,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,27 +1013,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1048,12 +1048,12 @@ index 5ebbc8d2db..48acf8d0cd 100644 -#include "paddle/phi/kernels/funcs/quant_dequant.h" +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_context.h" - ++#include "kernels/metax_kernel/metax_context.h" + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1067,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1080,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h index 7b85903776..3f4b298807 100644 --- a/paddle/phi/kernels/impl/merged_momentum_impl.h @@ -1118,14 +1118,14 @@ index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h +++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -14,7 +14,7 @@ - + #pragma once - + -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" - + diff --git a/third_party/flagcx b/third_party/flagcx index 7c469f4af9..7e6c4cc3ca 160000 --- a/third_party/flagcx diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 37475773026..410ef006514 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -87,32 +87,34 @@ list( list( REMOVE_ITEM PYTHON_TEST_SCRIPTS - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # 受 test_sum_op.py 影响 - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion - # 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # op_test.py 里 - # self._get_places() - # 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64 - # precision - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # op_test.py 里 - # self._get_places() 接口适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties -) + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) foreach(test_script ${PYTHON_TEST_SCRIPTS}) From c05f95f939c84359cc5bee3e3d88cb7235f55e83 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Fri, 19 Sep 2025 14:57:09 +0800 Subject: [PATCH 4/9] [Metax] add log analysis script --- .../metax_gpu/tests/scripts/classify.json | 22 ++ .../metax_gpu/tests/scripts/log_analysis.py | 216 ++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 backends/metax_gpu/tests/scripts/classify.json create mode 100644 backends/metax_gpu/tests/scripts/log_analysis.py diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json new file mode 100644 index 00000000000..b97255adc3d --- /dev/null +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -0,0 +1,22 @@ +{ + "OK":{ + "skipped":{ + "rule":["skipped="] + } + }, + + "FAILED":{ + "precision":{ + "rule":["Mismatched elements"] + }, + "api":{ + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + }, + "missing":{ + "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + }, + "file_not_found":{ + "rule":["FileNotFoundError:"] + } + } +} diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py new file mode 100644 index 00000000000..c0716f5b6f5 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -0,0 +1,216 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
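For orientation (a sketch for illustration, not part of the patched file): classify.json above is a two-level table mapping a result status ("OK"/"FAILED") to categories, each carrying a "rule" list of substrings, and a log line is assigned to a category as soon as one of that category's substrings occurs in it. A minimal sketch of the matching, with illustrative rules only:

rules = {
    "precision": ["Mismatched elements"],
    "missing": ["missing metax_gpu kernel"],
}

def classify(line: str, rules: dict) -> str:
    # The first category whose keyword occurs in the line wins;
    # "other" is the fallback, as in the analyzer below.
    for category, keywords in rules.items():
        if any(keyword in line for keyword in keywords):
            return category
    return "other"

assert classify("Mismatched elements: 1 / 100 (1%)", rules) == "precision"
assert classify("some unrelated line", rules) == "other"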
+
+import json
+import os
+import fnmatch
+import shutil
+from enum import Enum
+
+
+class TestResult(Enum):
+    OK = "OK"
+    FAILURE = "FAILED"
+
+
+class LogAnalyzer:
+    def __init__(
+        self,
+        classify_file: str,
+        search_path: str,
+        pattern: str = None,
+        encoding: str = "utf-8",
+    ):
+        self.__pattern = pattern
+        self.__search_path = search_path
+        self.__encoding = encoding
+        self.__statistical_data = {}
+
+        self.__classify_data = self.__read_json_file(classify_file)
+        for key, value in self.__classify_data.items():
+            self.__statistical_data[key] = {}
+            for sub_key in list(value.keys()):
+                self.__statistical_data[key][sub_key] = []
+
+        self.__statistical_data[TestResult.OK.value]["noskip"] = []
+        self.__statistical_data[TestResult.FAILURE.value]["other"] = []
+
+    def __read_json_file(self, path: str) -> dict:
+        with open(path, "r", encoding=self.__encoding) as f:
+            data = json.load(f)
+            f.close()
+        return data
+
+    def __check_path(self, path: str) -> None:
+        """
+        Handle the given path:
+        - directory path: create it if missing, otherwise clear its contents
+        - file path: create it if missing, otherwise truncate its contents
+        """
+        try:
+            # Check whether the path exists
+            if os.path.exists(path):
+                # The path exists; decide whether it is a file or a directory
+                if os.path.isfile(path):
+                    # Handle a file: truncate its contents
+                    with open(path, "w", encoding="utf-8") as f:
+                        f.write("")  # write empty content to clear the file
+                    # print(f"File already exists, contents cleared: {path}")
+
+                elif os.path.isdir(path):
+                    # Handle a directory: remove everything inside it
+                    for item in os.listdir(path):
+                        item_path = os.path.join(path, item)
+                        if os.path.isfile(item_path) or os.path.islink(item_path):
+                            os.remove(item_path)  # remove the file or symlink
+                        elif os.path.isdir(item_path):
+                            shutil.rmtree(item_path)  # recursively remove the subdirectory
+                    # print(f"Directory already exists, contents cleared: {path}")
+            else:
+                # The path does not exist; infer the target type (by whether the last component has an extension)
+                # Take the last component of the path
+                last_part = os.path.basename(path)
+
+                # Treat it as a file path if the name contains an extension
+                if "." in last_part and not last_part.endswith("."):
+                    # Create the file (including parent directories)
+                    parent_dir = os.path.dirname(path)
+                    if parent_dir and not os.path.exists(parent_dir):
+                        os.makedirs(parent_dir, exist_ok=True)
+                    with open(path, "w", encoding="utf-8") as f:
+                        pass  # create an empty file
+                    # print(f"File did not exist, created: {path}")
+
+                else:
+                    # Create the directory (multi-level paths supported)
+                    os.makedirs(path, exist_ok=True)
+                    # print(f"Directory did not exist, created: {path}")
+
+        except PermissionError:
+            print(f"Permission error: cannot operate on path {path}")
+        except Exception as e:
+            print(f"Error while handling path: {str(e)}")
+
+    def save_result(self, dir_path: str = "./") -> None:
+        """
+        Ensure each category directory exists:
+        - create it if missing
+        - otherwise clear everything inside it (keeping the directory itself)
+        """
+
+        for key, value in self.__statistical_data.items():
+            sub_dir = os.path.join(dir_path, key)
+            self.__check_path(sub_dir)
+
+            for sub_key, sub_value in value.items():
+                # print(f"{sub_key}: {len(value[sub_key])} - ({sub_value})")
+                try:
+                    with open(
+                        os.path.join(sub_dir, sub_key) + ".txt", "w", encoding="utf-8"
+                    ) as f:
+                        for op_name in sub_value:
+                            if not op_name.endswith("\n"):
+                                op_name += "\n"
+                            f.write(op_name)
+                    # print(f"Content successfully {'appended' if append else 'written'} to {file_path}")
+                except Exception as e:
+                    print(f"Failed to write file: {e}")
+
+    def show_result(self) -> None:
+        test_counts = 0
+        for key, value in self.__statistical_data.items():
+            print(f"\n---------- {key} ----------")
+            for sub_key, sub_value in value.items():
+                test_counts = test_counts + len(value[sub_key])
+                print(f"{sub_key}: {len(value[sub_key])}\n\t{sub_value}\n")
+        print(
+            f"\n******************* Total log num: {test_counts} *******************\n\n"
+        )
+
+    def run(self):
+        """
+        Read the files under the search path whose names match the pattern
+        and scan them line by line.
+
+        Args:
+            search_path: root directory to search
+            pattern: file name pattern (wildcards supported, e.g. '*.txt', 'file_*.log')
+        """
+        for dirpath, dirnames, filenames in os.walk(self.__search_path):
+            for filename in fnmatch.filter(filenames, self.__pattern):
+                file_path = os.path.join(dirpath, filename)
+                # print(f"\n===== Processing file: {file_path} =====")
+
+                cur_res_type = TestResult.FAILURE
+                cur_sub_type = "other"
+                pre_line = None
+                finish_early = False
+
+                try:
+                    with open(file_path, "r", encoding=self.__encoding) as f:
+                        for line in f:
+                            for sub_type, sub_type_params in self.__classify_data[
+                                cur_res_type.value
+                            ].items():
+                                for keyword in sub_type_params["rule"]:
+                                    if keyword in line:
+                                        cur_sub_type = sub_type
+                                        if sub_type == "missing":
+                                            finish_early = True
+                                        break
+
+                                if finish_early:
+                                    break
+
+                            pre_line = line
+                            if finish_early:
+                                break
+
+                        if "OK" in pre_line:
+                            cur_res_type = TestResult.OK
+                            cur_sub_type = None
+                            for sub_type, sub_type_params in self.__classify_data[
+                                cur_res_type.value
+                            ].items():
+                                for rule in sub_type_params["rule"]:
+                                    if rule in line:
+                                        cur_sub_type = sub_type
+
+                        op_name = filename.split(".")
+                        if cur_sub_type is None:
+                            self.__statistical_data[cur_res_type.value][
+                                "noskip"
+                            ].append(op_name[0])
+                        else:
+                            self.__statistical_data[cur_res_type.value][
+                                cur_sub_type
+                            ].append(op_name[0])
+                        # print(f"Result: {cur_res_type.value}, type: {cur_sub_type}")
+                    f.close()
+                except UnicodeDecodeError:
+                    print(f"Warning: file {file_path} is not utf-8 encoded, skipping it")
+                except Exception as e:
+                    print(f"Error while processing file {file_path}: {str(e)}")
+
+
+if __name__ == "__main__":
+
+    analyzer = LogAnalyzer(
+        classify_file="./classify.json",
+        search_path="./NPU_logs/20250918_065326",
+        pattern="test_*.log",
+    )
+
+    analyzer.run()
+    analyzer.show_result()
+    analyzer.save_result("./output")

From 465f54ac8a8ae5f89f045f87c7aaaed5088488ef Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Mon, 22 Sep 2025 17:29:30 +0800
Subject: [PATCH 5/9] [Metax] update metax backend CI test

---
 backends/metax_gpu/tests/CMakeLists.txt | 192 +++++++++++-------------
 backends/metax_gpu/tests/default.txt    |  67 +++++++++
 backends/metax_gpu/tests/run_test.sh    |  56 ++++++-
 3 files changed, 202 insertions(+), 113 deletions(-)
 create mode 100644 backends/metax_gpu/tests/default.txt

diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt
index 795a3c5b8ac..ded54233f24 100755
--- a/backends/metax_gpu/tests/CMakeLists.txt
+++ b/backends/metax_gpu/tests/CMakeLists.txt
@@ -11,117 +11,95 @@ set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test)

 file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py")

-list(
-  APPEND
-  PYTHON_TEST_SCRIPTS
-  ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
-
${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py) - -list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - # 精度问题 - ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py - # core.cudnnversion - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py - # op_test.py 里 self._get_places()接口的适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py - # device == "gpu" 适配问题 - ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py - # paddle-gpu 报错一致 - ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py - # paddle.device.cuda.get_device_properties - ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py - ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py - # needs check_grad with fp64 precision - ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py - # CUDAPinnedPlace 问题 - ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py - 
${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +if(NOT TEST_LIST_FILE) + message( + STATUS + " is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used." + ) + file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) + +else() + if(NOT EXISTS ${TEST_LIST_FILE}) + message(FATAL_ERROR " is not exist, please check it again.") + endif() + + file(STRINGS ${TEST_LIST_FILE} TEST_PROGRAMS) + + if(NOT TEST_PROGRAMS) + message(FATAL_ERROR " is empty.") + endif() + + set(PYTHON_TEST_SCRIPTS "") +endif() + +foreach(test_name ${TEST_PROGRAMS}) + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) + message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") + else() + list(APPEND PYTHON_TEST_SCRIPTS ${CURRENT_TEST_PROGRAM}) + endif() +endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) + +if(NOT TEST_LIST_FILE) + list( + REMOVE_ITEM + PYTHON_TEST_SCRIPTS + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口的适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +endif() + +if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) + file(MAKE_DIRECTORY ${LOG_OUTPUT_DIR}) + message(WARNING "${LOG_OUTPUT_DIR} is not exist, create it now.") +endif() + foreach(test_script ${PYTHON_TEST_SCRIPTS}) get_filename_component(test_name ${test_script} NAME_WE) - add_test( - NAME "python_${test_name}" - COMMAND ${Python_EXECUTABLE} ${test_script} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + if(LOG_OUTPUT_DIR) + set(test_log_file "${LOG_OUTPUT_DIR}/${test_name}.log") + + add_test( + NAME "python_${test_name}" + COMMAND sh -c + "${Python_EXECUTABLE} ${test_script} > ${test_log_file} 2>&1" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + + else() + add_test( + NAME "python_${test_name}" + COMMAND ${Python_EXECUTABLE} ${test_script} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) endforeach() diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt new file mode 100644 index 00000000000..8e2c3bcdd7e --- /dev/null +++ b/backends/metax_gpu/tests/default.txt @@ -0,0 +1,67 @@ +test_accuracy_op +test_tril_triu_op +test_where_op +test_split_op +test_fill_constant_op +test_empty_op +test_sign_op +test_cast_op +test_index_add_op +test_unbind_op +test_put_along_axis_op +test_layer_norm_op +test_maximum_op 
+test_accuracy_op
+test_strided_slice_op
+test_sum_op
+test_set_value_op
+test_flatten_contiguous_range_op
+test_top_k_op
+test_subtract_op
+test_softmax_op
+test_cumsum_op
+test_greater_equal_op
+test_elementwise_div_op
+test_top_k_v2_op
+test_stack_op
+test_one_hot_v2_op
+test_fill_any_op
+test_gather_op
+test_reshape_op
+test_index_put_op
+test_bitwise_op
+test_max_op
+test_pad_op
+test_elementwise_pow_op
+test_uniform_random_op
+test_scatter_op
+test_cast_op
+test_zeros_like_op
+test_compare_op
+test_shape_op
+test_tril_triu_op
+test_slice_op
+test_elementwise_add_op
+test_index_put_op
+test_bincount_op
+test_assign_op
+test_logical_op
+test_squared_l2_norm_op
+test_mean_op
+test_fused_bias_act_op
+test_expand_v2_op
+test_adamw_op
+test_gather_nd_op
+test_concat_op
+test_scatter_nd_op
+test_elementwise_floordiv_op
+test_elementwise_mul_op
+test_transpose_op
+test_einsum_op
+test_randint_op
+test_c_embedding_op
+test_numel_op
+test_scale_op
+test_softmax_with_cross_entropy_op
+test_full_op
+test_scatter_op
diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh
index 7d1e8e072a9..b9e8ec5b5cc 100755
--- a/backends/metax_gpu/tests/run_test.sh
+++ b/backends/metax_gpu/tests/run_test.sh
@@ -2,13 +2,13 @@
 #!/bin/bash
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-#
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-#
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,10 +29,54 @@ export

 rm -r build
 mkdir -p build && cd build

-cmake ..
+TEST_LOG_LEVEL=0
+TEST_LIST_FILE=""
+TEST_LOG_OUTPUT_DIR=""
+TEST_PARALLEL_NUM=10

-cmake --build .
+while getopts "i:o:v:j:h" opt; do
+  case "$opt" in
+    i)
+      TEST_LIST_FILE="$OPTARG"
+      ;;
+    o)
+      TEST_LOG_OUTPUT_DIR="$OPTARG"
+      echo "Set log output dir [ $TEST_LOG_OUTPUT_DIR ]"
+      ;;
+    v)
+      TEST_LOG_LEVEL=$OPTARG
+      ;;
+    j)
+      TEST_PARALLEL_NUM="$OPTARG"
+      ;;
+    h)
+      echo "Usage: $0 -i <test list file> -o <log output dir> ..."
+      echo "Options:"
+      echo "  -i  test list file"
+      echo "  -o  log output directory"
+      echo "  -v  GLOG_v log level"
+      echo "  -j  ctest parallel job count"
+      echo "  -h  show this help"
+      exit 0
+      ;;
+    \?)
+      echo "error: unknown option '-$OPTARG'."
+      exit 1
+      ;;
+    :)
+      echo "error: option '-$OPTARG' requires an argument."
+      exit 1
+      ;;
+  esac
+done
+
+
+export GLOG_v=$TEST_LOG_LEVEL

-ctest -j10 --output-on-failure
+cmake .. -DTEST_LIST_FILE=$TEST_LIST_FILE -DLOG_OUTPUT_DIR=$TEST_LOG_OUTPUT_DIR
+
+cmake --build .
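For context, a hypothetical way to drive the reworked script (the -i/-o/-v/-j flags come from the getopts block above; the file name and test choices here are illustrative). A test list file uses the same format as default.txt: one legacy-test module name per line, without the .py suffix.

import pathlib
import subprocess
import tempfile

# Write a one-off test list in the default.txt format.
list_file = pathlib.Path(tempfile.mkdtemp()) / "smoke.txt"
list_file.write_text("test_tril_triu_op\ntest_where_op\n")

# -i selects the list file (forwarded as -DTEST_LIST_FILE), -o collects one
# log per test (-DLOG_OUTPUT_DIR), -v sets GLOG_v, -j the ctest parallelism.
subprocess.run(
    ["bash", "run_test.sh", "-i", str(list_file), "-o", "./logs", "-v", "0", "-j", "4"],
    check=True,
)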
+ +ctest -j$TEST_PARALLEL_NUM --output-on-failure From 4f155310d76ac1b448e526cab281e4d1af9eb1ba Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Mon, 22 Sep 2025 18:02:57 +0800 Subject: [PATCH 6/9] [Metax] fix log_analysis.py bug --- .../metax_gpu/tests/scripts/log_analysis.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py index c0716f5b6f5..963d50751f7 100644 --- a/backends/metax_gpu/tests/scripts/log_analysis.py +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -153,7 +153,6 @@ def run(self): cur_res_type = TestResult.FAILURE cur_sub_type = "other" - pre_line = None finish_early = False try: @@ -172,19 +171,19 @@ def run(self): if finish_early: break - pre_line = line if finish_early: break - if "OK" in pre_line: - cur_res_type = TestResult.OK - cur_sub_type = None - for sub_type, sub_type_params in self.__classify_data[ - cur_res_type.value - ].items(): - for rule in sub_type_params["rule"]: - if rule in line: - cur_sub_type = sub_type + if len(line) >= 2 and line[:2] == "OK": + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + break op_name = filename.split(".") if cur_sub_type is None: From cb53dd324451900ef58c95a36e3e85e8d15360f0 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Tue, 23 Sep 2025 11:06:41 +0800 Subject: [PATCH 7/9] [Metax] update metax CI CMakeLists & scripts --- .github/workflows/metax_work.yaml | 2 +- backends/metax_gpu/tests/CMakeLists.txt | 4 ++- backends/metax_gpu/tests/run_test.sh | 2 +- .../metax_gpu/tests/scripts/classify.json | 31 +++++++++++++++++-- 4 files changed, 33 insertions(+), 6 deletions(-) diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml index 51c0c62cef6..aff530d475c 100644 --- a/.github/workflows/metax_work.yaml +++ b/.github/workflows/metax_work.yaml @@ -51,4 +51,4 @@ jobs: - name: run test run: | cd backends/metax_gpu/tests - bash run_test.sh + bash run_test.sh -j 16 diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index ded54233f24..5b7be15e4f9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -47,6 +47,8 @@ if(NOT TEST_LIST_FILE) list( REMOVE_ITEM PYTHON_TEST_SCRIPTS + # Metax unit test + ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py @@ -101,5 +103,5 @@ foreach(test_script ${PYTHON_TEST_SCRIPTS}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() - set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 600) endforeach() diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index b9e8ec5b5cc..7f2277fe4fb 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -33,7 +33,7 @@ mkdir -p build && cd build TEST_LOG_LEVEL=0 TEST_LIST_FILE="" TEST_LOG_OUTPUT_DIR="" -TEST_PARALLEL_NUM=10 +TEST_PARALLEL_NUM=1 while getopts "i:o:v:j:h" opt; do case "$opt" in diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json index b97255adc3d..ca92ad4a0a4 100644 --- 
a/backends/metax_gpu/tests/scripts/classify.json +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -7,13 +7,38 @@ "FAILED":{ "precision":{ - "rule":["Mismatched elements"] + "rule":["Mismatched elements", + "RuntimeError: Jacobian mismatch for output 0 in y with respect to input 0 in x on Place(metax_gpu:0),", + "AssertionError: np.float64("] }, "api":{ - "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", + "ValueError: The API paddle.device.cuda.get_device_properties", + "TypeError: paddle.index_add api", + "RuntimeError: (Unavailable) Paddle is not compiled with CUDA.", + "ValueError: invalid literal for int() with base", + "AttributeError: module 'paddle.base.libpaddle' has no attribute 'cudnn_version'", + "RuntimeError: Pinning memory is not supported for Place(metax_gpu:0)", + "PreconditionNotMetError: Context place error, excepted GPUPlace, but actually Place(metax_gpu:0).", + "AttributeError: module 'paddle.base.libpaddle.eager.ops.legacy' has no attribute 'fused_gemm_epilogue'", + "ValueError: The device should not be 'gpu', since PaddlePaddle is not compiled with CUDA"] }, "missing":{ - "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + "rule":["missing metax_gpu kernel", + "missing ONEDNN kernel", + "UnimplementedError: There are no kernels which are registered", + "symbol lookup error:", + "RuntimeError: (NotFound) The kernel"] + }, + "core_dumped":{ + "rule":["Segmentation fault"] + }, + "input_dim":{ + "rule":["ValueError: (InvalidArgument) The Input(", + "Test range of input is out of bound"] + }, + "array_dim":{ + "rule":["Arrays are not equal"] }, "file_not_found":{ "rule":["FileNotFoundError:"] From 16fc6c75db07077d77818cfd2b7ac2e2b497a422 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Tue, 23 Sep 2025 17:02:14 +0800 Subject: [PATCH 8/9] [Metax] fix MatmulKernel problem --- .../kernels/impl/matmul_kernel_impl.h | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h index bf228c81291..5221bd93ba9 100755 --- a/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/matmul_kernel_impl.h @@ -40,6 +40,7 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 && 0 #include "paddle/phi/kernels/autotune/auto_tune_base.h" #endif +#include "paddle/phi/kernels/full_kernel.h" // clang-format on namespace phi { @@ -1485,16 +1486,22 @@ void MatmulKernel(const Context& ctx, bool transpose_x, bool transpose_y, DenseTensor* out) { - PADDLE_ENFORCE_NE( + if (x.numel() == 0 || y.numel() == 0) { + // input shape [1, 1, 5, 0], [1, 1, 0, 5], result shape is [1, 1, 5, 5] + phi::Full( + ctx, phi::IntArray(common::vectorize(out->dims())), 0, out); + return; + } + PADDLE_ENFORCE_GE( common::product(x.dims()), 0, - phi::errors::InvalidArgument("The Input(X) dims size must not be equal 0," - " but reviced dims size is 0. ")); - PADDLE_ENFORCE_NE( + common::errors::InvalidArgument( + "The dims of Input(X) should be greater than or equal to 0.")); + PADDLE_ENFORCE_GE( common::product(y.dims()), 0, - phi::errors::InvalidArgument("The Input(Y) dims size must not be equal 0," - " but reviced dims size is 0. 
")); + common::errors::InvalidArgument( + "The dims of Input(Y) should be greater than or equal to 0.")); const std::vector x_dims = common::vectorize(x.dims()); const std::vector y_dims = common::vectorize(y.dims()); MatmulJudgeDtypeKernel( From 05d5b1cb6f9de6dc478b2b45c2f040d07ff34699 Mon Sep 17 00:00:00 2001 From: "Mingkun.Zhang" <2496808993@qq.com> Date: Tue, 23 Sep 2025 18:41:34 +0800 Subject: [PATCH 9/9] [Metax] update metax CI program --- backends/metax_gpu/tests/CMakeLists.txt | 2 +- backends/metax_gpu/tests/default.txt | 258 ++++++++++++ ...r_equal.py => test_greater_equal_metax.py} | 0 ...ild_src_rank_and_local_expert_id_metax.py} | 0 ...cubate_expand_modality_expert_id_metax.py} | 0 ....py => test_incubate_moe_combine_metax.py} | 0 ...e_dispatch_partial_nosoftmaxtopk_metax.py} | 0 ..._moe_gate_dispatch_w_permute_bwd_metax.py} | 0 ...bate_moe_gate_dispatch_w_permute_metax.py} | 0 ...layer_norm.py => test_layer_norm_metax.py} | 0 ...l_op__metax.py => test_matmul_op_metax.py} | 0 ...mpling.py => test_top_p_sampling_metax.py} | 0 .../tests/unittest/test_matmul_op__metax.py | 395 ------------------ 13 files changed, 259 insertions(+), 396 deletions(-) rename backends/metax_gpu/tests/unit_test/{test_greater_equal.py => test_greater_equal_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_build_src_rank_and_local_expert_id.py => test_incubate_build_src_rank_and_local_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_expand_modality_expert_id.py => test_incubate_expand_modality_expert_id_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_combine.py => test_incubate_moe_combine_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py => test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute_bwd.py => test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_incubate_moe_gate_dispatch_w_permute.py => test_incubate_moe_gate_dispatch_w_permute_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_layer_norm.py => test_layer_norm_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_matmul_op__metax.py => test_matmul_op_metax.py} (100%) rename backends/metax_gpu/tests/unit_test/{test_top_p_sampling.py => test_top_p_sampling_metax.py} (100%) delete mode 100644 backends/metax_gpu/tests/unittest/test_matmul_op__metax.py diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 5b7be15e4f9..e8b11d347d9 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -48,7 +48,7 @@ if(NOT TEST_LIST_FILE) REMOVE_ITEM PYTHON_TEST_SCRIPTS # Metax unit test - ${METAX_UNIT_TEST_PATH}/test_matmul_op__metax.py + ${METAX_UNIT_TEST_PATH}/test_matmul_op_metax.py # 精度问题 ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt index 8e2c3bcdd7e..9f073d7e92f 100644 --- a/backends/metax_gpu/tests/default.txt +++ b/backends/metax_gpu/tests/default.txt @@ -65,3 +65,261 @@ test_scale_op test_softmax_with_cross_entropy_op test_full_op test_scatter_op +test_assign_pos_op +test_index_select_compatible +test_dequantize_abs_max_op +test_fill_any_op +test_fractional_max_pool3d_api +test_nll_loss +test_is_empty_op 
+test_norm_nn_grad +test_index_fill +test_floor +test_slice_scatter +test_nn_matmul_v2_grad +test_matmul_op_with_head +test_broadcast_shape +test_fill_constant_op +test_decayed_adagrad_op +test_count_nonzero_api +test_tensor_fill_ +test_minimum_op +test_sigmoid_focal_loss +test_dynamic_rnn_stop_gradient +test_ops_roi_align +test_split_op +test_sum_decorator +test_share_data_op +test_assert_op +test_masked_select_op +test_tensor_fill_diagonal_tensor_ +test_unfold_op +test_scatter_add_op +test_flatten_contiguous_range_op +test_empty_like_op +test_logsumexp +test_multiply +test_ceil_op +test_nearest_interp_v2_op +test_incubate_expand_modality_expert_id +test_bmm_op +test_prelu_op +test_batch_fc_op +test_masked_fill +test_overlap_add_op +test_update_loss_scaling_op +test_floor_divide_op +test_increment +test_complex_abs +test_gather_compatible +test_functional_conv2d +test_group_norm_op_v2 +test_conv2d_transpose_op_depthwise_conv +test_diagonal_op +test_maximum_op +test_erfinv_op +test_interp_recompute_scale_factor +test_embedding_scale_grad_by_freq +test_diagonal_scatter +test_higher_dim_scatter +test_infer_shape +test_flip +test_fused_bias_dropout_residual_layer_norm_op +test_greater_equal_op +test_add_op +test_cartesian_prod +test_uniform_random_inplace_op +test_feed_fetch_method +test_pow_op +test_conv3d_transpose_op +test_add_position_encoding_op +test_imperative_data_loader_base +test_rnn_cell_api +test_linspace +test_adaptive_log_softmax_with_loss +test_cross_entropy2_op +test_complex_reshape +test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk +test_gaussian_nll_loss +test_log_normal +test_unstack_op +test_expand_as_v2_op +test_dequantize_log_op +test_complex_sum_layer +test_slice_var +test_scale_op +test_hinge_embedding_loss +test_set_value_op +test_merged_adam_op +test_index_sample_op +test_cuda_empty_cache +test_add_n_op +test_randint_like +test_unique_consecutive_op +test_fill_diagonal_tensor_op +test_log_loss_op +test_linalg_cholesky_inverse +test_numel_op +test_tril_triu_op +test_adaptive_max_pool2d +test_sigmoid_cross_entropy_with_logits_grad_with_auto_grad +test_complex_cast +test_poisson_nll_loss +test_empty_op +test_functional_conv1d_transpose +test_clip_by_norm_op +test_box_clip_op +test_clip_op +test_grad_clip_minimize +test_less_than_op +test_adamw_op +test_data_feeder +test_top_p_sampling +test_subtract_op +test_batch_norm_op_v2 +test_cosine_embedding_loss +test_imperative_data_parallel +test_sigmoid +test_adaptive_max_pool3d +test_roll_op +test_index_put_op +test_assign_op +test_amp_check_finite_and_scale_op +test_strided_slice_op +test_label_smooth_functional +test_c_softmax_with_cross_entropy_op +test_sync_batch_norm_op_convert +test_tensor_fill_diagonal_tensor +test_bfloat16_embedding +test_gelu_op +test_full_ +test_concat_op +test_imperative_data_loader_process +test_tensor_fill_diagonal_ +test_clip_grad_norm_ +test_eager_deletion_padding_rnn +test_pool2d_api +test_clip_grad_value_ +test_isfinite_v2_op +test_nn_sigmoid_op +test_adaptive_avg_pool2d +test_size +test_sigmoid_cross_entropy_with_logits_op +test_scatter_reduce_op +test_rsqrt +test_conv2d_transpose_layer +test_scatter_compatible +test_scatter_nd_op +test_add_op_fluid +test_unique +test_compat_split_static +test_stack_op +test_tile_op +test_adam_optimizer_fp32_fp64 +test_batch_norm_op +test_gather_nd_op +test_pow +test_executor_check_fetch_list +test_inplace_softmax_with_cross_entropy +test_cos +test_imperative_parallel_coalesce_split +test_grid_sample_function +test_rnn_decode_api +test_triu_indices_op 
+test_binary_cross_entropy_with_logits_op +test_mean_op_v1 +test_round_op +test_assign_pos_op_dygraph +test_nn_functional_embedding_static +test_norm_op +test_unbind_op +test_bilinear_interp_v2_op +test_tensor_data_ptr +test_norm_all +test_conv1d_transpose_layer +test_arange +test_compat_unfold +test_fetch_var +test_index_select_op +test_sign_op +test_functional_conv3d_transpose +test_uniform_random_bf16_op +test_gather_tree_op +test_histogram_bin_edges_op +test_fractional_max_pool2d_api +test_fill_any_like_op +test_alpha_dropout +test_conv3d_layer +test_compat_pad +test_box_coder_op +test_full_op +test_repeat_interleave_op +test_reshape_op +test_embedding_renorm +test_log_softmax +test_pad3d_op +test_diag_v2 +test_complex_transpose +test_prior_box_op +test_square_error_cost +test_fused_rotary_position_embedding +test_gru_rnn_op +test_restrict_nonzero +test_dygraph_weight_norm +test_conv_transpose_nn_grad +test_incubate_build_src_rank_and_local_expert_id +test_elementwise_nn_grad +test_fused_bias_dropout_residual_layer_norm_op_api +test_simple_rnn_op +test_data_generator +test_compat_split +test_scatter_add_inplace_op +test_c_softmax_with_multi_label_cross_entropy_op +test_conv3d_transpose_layer +test_less_equal_op +test_gumbel_softmax_op +test_assign_value_op +test_cast_op +test_fused_bias_act_op +test_conv3d_transpose_part2_op +test_log +test_data +test_incubate_moe_combine +test_masked_scatter +test_silu_op +test_select_scatter_op +test_adagrad_op_v2 +test_functional_conv3d +test_bce_with_logits_loss +test_argsort_op +test_layer_norm_op_v2 +test_adaptive_max_pool1d +test_shard_index_op +test_cuda_max_memory_allocated +test_roi_align_op +test_sin +test_take +test_take_along_dim +test_complex_matmul +test_reduce_as_op +test_log_normal_inplace +test_repeat +test_fetch_lod_tensor_array +test_partial_concat_op +test_accuracy_op +test_l1_norm_op +test_bce_loss +test_fused_conv2d_add_act_op +test_tril_indices_op +test_cross_entropy_op +test_blha_get_max_len_op +test_softmax_mask_fuse_op +test_diag_embed +test_one_hot_v2_op +test_selu_op +test_huber_loss_op +test_einsum_op +test_dygraph_spectral_norm +test_block_diag +test_index_elementwise +test_matmul_out diff --git a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_greater_equal.py rename to backends/metax_gpu/tests/unit_test/test_greater_equal_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py rename to backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py rename to 
backends/metax_gpu/tests/unit_test/test_incubate_moe_combine_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py rename to backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_layer_norm.py rename to backends/metax_gpu/tests/unit_test/test_layer_norm_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py rename to backends/metax_gpu/tests/unit_test/test_matmul_op_metax.py diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py similarity index 100% rename from backends/metax_gpu/tests/unit_test/test_top_p_sampling.py rename to backends/metax_gpu/tests/unit_test/test_top_p_sampling_metax.py diff --git a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py b/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py deleted file mode 100644 index 7545e16d14d..00000000000 --- a/backends/metax_gpu/tests/unittest/test_matmul_op__metax.py +++ /dev/null @@ -1,395 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import unittest -from tests.op_test import OpTest -import paddle - -paddle.enable_static() -SEED = 2022 - - -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. 
- if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size,)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size,)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if abs(scale - 1.0) > 1e-09: - Out = Out * scale - return Out - - -class TestBmmOp(OpTest): - """ - case 0 - """ - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (10, 2, 5) - self.y_shape = (10, 5, 8) - - def init_kernel_type(self): - self.dtype = "float32" - - def setUp(self): - self.set_metax_gpu() - self.init_kernel_type() - self.config() - self.op_type = "bmm" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y - result = reference_matmul(x, y) - result = result.astype(self.dtype) - self.inputs = { - "X": x, - "Y": y, - } - self.outputs = {"Out": result} - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp1(TestBmmOp): - """ - case 1 - """ - - def config(self): - self.x_shape = (40, 10, 10) - self.y_shape = (40, 10, 10) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - def test_check_grad(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestBmmOp2(TestBmmOp): - """ - case 2 - """ - - def config(self): - self.x_shape = (4, 10, 80) - self.y_shape = (4, 80, 1) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, - ["X", "Y"], - "Out", - max_relative_error=1e-2, - ) - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-3) - - -class TestMatMulOp(OpTest): - """ - basic case - """ - - def setUp(self): - self.set_metax_gpu() - self.op_type = "matmul_v2" - self.init_dtype() - self.init_alpha() - self.config() - - X = np.random.random(self.x_shape).astype(self.dtype) - Y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - X = -0.1 + 0.2 * X - Y = -0.1 + 0.2 * Y - Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) - Out = Out.astype(self.dtype) - self.inputs = {"X": X, "Y": Y} - self.attrs = { - "trans_x": self.transpose_X, - "trans_y": self.transpose_Y, - "alpha": self.alpha, - } - self.outputs = {"Out": Out} - - def set_metax_gpu(self): - self.__class__.use_custom_device = True - self.place = paddle.CustomPlace("metax_gpu", 0) - - def config(self): - self.x_shape = (100,) - self.y_shape = (100,) - self.transpose_X = False - self.transpose_Y = False - - def init_alpha(self): - self.alpha = 1.0 - - def init_dtype(self): - self.dtype = "float32" - - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-7) - - def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ["X", "Y"], "Out") - - -class TestMatMulOp1(TestMatMulOp): - """ - case x_ndim == 1, y_ndim != 1 - """ - - def config(self): - self.x_shape = (100,) - self.y_shape = (1, 3, 2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp2(TestMatMulOp): - """ - case 
x_ndim != 1, y_ndim == 1 - """ - - def config(self): - self.x_shape = (1, 2, 100, 1) - self.y_shape = (100,) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp3(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp4(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (2, 100) - self.y_shape = (2, 100) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp5(TestMatMulOp): - """ - case [M, K] x [K, N] = [M, N] - """ - - def config(self): - self.x_shape = (100, 2) - self.y_shape = (100, 2) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp6(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 2, 25) - self.y_shape = (25, 4) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp7(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 4, 25) - self.y_shape = (4, 25) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp8(TestMatMulOp): - """ - case [B, M, K] x [K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (1, 25, 4) - self.y_shape = (25, 4) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp9(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 10, 5) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp10(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 10, 5) - self.y_shape = (2, 10, 5) - self.transpose_X = True - self.transpose_Y = False - - -class TestMatMulOp11(TestMatMulOp): - """ - case [B, M, K] x [B, K, N] = [B, M, N] - """ - - def config(self): - self.x_shape = (2, 5, 10) - self.y_shape = (2, 5, 10) - self.transpose_X = False - self.transpose_Y = True - - -class TestMatMulOp12(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = 100 - self.y_shape = (1, 2, 2, 100, 2) - self.transpose_X = False - self.transpose_Y = False - - -class TestMatMulOp13(TestMatMulOp): - """ - case to check the gradient for special case - """ - - def config(self): - self.x_shape = (2, 1, 100) - self.y_shape = 100 - self.transpose_X = False - self.transpose_Y = False - - -# TODO(metax_gpu): alpha will be supported in next version -# --------------------test matmul alpha-------------------- -# def create_test_alpha_class(parent): -# class TestMatMulOpAlphaCase(parent): -# def init_alpha(self): -# self.alpha = 0.125 - -# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") -# TestMatMulOpAlphaCase.__name__ = cls_name -# globals()[cls_name] = TestMatMulOpAlphaCase - -# create_test_alpha_class(TestMatMulOp) -# create_test_alpha_class(TestMatMulOp1) -# create_test_alpha_class(TestMatMulOp2) -# create_test_alpha_class(TestMatMulOp3) -# create_test_alpha_class(TestMatMulOp4) -# create_test_alpha_class(TestMatMulOp5) -# create_test_alpha_class(TestMatMulOp6) -# create_test_alpha_class(TestMatMulOp9) -# create_test_alpha_class(TestMatMulOp10) -# create_test_alpha_class(TestMatMulOp11) -# create_test_alpha_class(TestMatMulOp12) -# create_test_alpha_class(TestMatMulOp13) - - -# --------------------test matmul 
fp16-------------------- -def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): - class TestMatMulOpFp16Case(parent): - def init_kernel_type(self): - self.dtype = np.float16 - - def test_check_output(self): - self.check_output_with_place(self.place, atol=atol) - - def test_check_grad(self): - self.check_grad_with_place( - self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error - ) - - cls_name = "{0}_{1}".format(parent.__name__, "Fp16") - TestMatMulOpFp16Case.__name__ = cls_name - globals()[cls_name] = TestMatMulOpFp16Case - - -create_test_fp16_class(TestMatMulOp) -create_test_fp16_class(TestMatMulOp1) -create_test_fp16_class(TestMatMulOp2) -create_test_fp16_class(TestMatMulOp3) -create_test_fp16_class(TestMatMulOp4) -create_test_fp16_class(TestMatMulOp5) -create_test_fp16_class(TestMatMulOp6) -create_test_fp16_class(TestMatMulOp9) -create_test_fp16_class(TestMatMulOp10) -create_test_fp16_class(TestMatMulOp11) -create_test_fp16_class(TestMatMulOp12) -create_test_fp16_class(TestMatMulOp13) - -if __name__ == "__main__": - unittest.main()
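Closing note on the PATCH 8 matmul change: when either operand has numel() == 0, the kernel now fills the output with zeros instead of rejecting the input. NumPy already implements these semantics, so a short sketch of the expected behavior (the shapes are the ones named in the comment in that hunk):

import numpy as np

# An empty contraction dimension (K == 0) yields a zero-filled result of the
# inferred output shape, e.g. [1, 1, 5, 0] x [1, 1, 0, 5] -> [1, 1, 5, 5].
x = np.zeros((1, 1, 5, 0), dtype=np.float32)
y = np.zeros((1, 1, 0, 5), dtype=np.float32)
out = np.matmul(x, y)
assert out.shape == (1, 1, 5, 5)
assert not out.any()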