From 762cf919abd7344c37773414c998186649142cb8 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Tue, 9 Sep 2025 16:04:01 +0800
Subject: [PATCH 1/5] [Metax] fix dgc & mklml compile product path problem

---
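Note (placed below the "---" separator, so `git am` ignores it): `include(paddle)`
is what pulls in the dgc and mklml third-party targets, and it consumes
THIRD_PARTY_PATH while it runs, so the path has to exist before the include;
the pre-existing `set(THIRD_PARTY_PATH ...)` after it came too late and the
compile products landed under the default prefix. A minimal sketch of the
ordering this commit relies on (standard CMake cache semantics, variable names
as in this file):

    # Define the cache entry BEFORE include(paddle). `set(... CACHE PATH ...)`
    # only takes effect if the entry is not already in the cache, and the later
    # plain set() merely shadows it with a normal variable of the same value.
    set(THIRD_PARTY_PATH
      "${PADDLE_SOURCE_DIR}/build/third_party"
      CACHE PATH "Third party libraries directory.")
    include(paddle)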
 backends/metax_gpu/CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index 787aae13e40..b6f9abccf14 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -26,6 +26,10 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")
 set(WITH_MKLML ON)
 
+set(THIRD_PARTY_PATH
+  "${PADDLE_SOURCE_DIR}/build/third_party"
+  CACHE PATH "Third party libraries directory.")
+
 include(paddle)
 set(THIRD_PARTY_PATH
   "${PADDLE_SOURCE_DIR}/build/third_party"

From 9249b1ecd662bf6d266ef404834a245eec81b115 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Thu, 11 Sep 2025 17:28:51 +0800
Subject: [PATCH 2/5] [Metax] update metax_gpu CMakeLists.txt

---
 backends/metax_gpu/CMakeLists.txt | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt
index b6f9abccf14..787aae13e40 100755
--- a/backends/metax_gpu/CMakeLists.txt
+++ b/backends/metax_gpu/CMakeLists.txt
@@ -26,10 +26,6 @@ set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 message(STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}")
 set(WITH_MKLML ON)
 
-set(THIRD_PARTY_PATH
-  "${PADDLE_SOURCE_DIR}/build/third_party"
-  CACHE PATH "Third party libraries directory.")
-
 include(paddle)
 set(THIRD_PARTY_PATH
   "${PADDLE_SOURCE_DIR}/build/third_party"

From 6634ef1611a035f00948245fe3aaaf5d79f97366 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Tue, 16 Sep 2025 18:12:17 +0800
Subject: [PATCH 3/5] [Metax] organize documents

---
 .../calc_reduced_attn_kernel_register.cu | 2 +-
 backends/metax_gpu/kernels/funcs/softmax.cu | 2 +-
 .../kernels/funcs/values_vectors_functor.h | 2 +-
 .../metax_gpu/kernels/gpudnn/conv_cudnn_v7.h | 2 +-
 .../conv_transpose_grad_kernel_register.cu | 2 +-
 .../kernels/gpudnn/pool_kernel_register.cu | 2 +-
 .../metax_gpu/kernels/gpudnn/softmax_gpudnn.h | 2 +-
 .../kernels/impl/dirichlet_kernel_impl.h | 2 +-
 .../addmm_grad_kernel_register.cu | 0
 .../addmm_kernel_register.cu | 0
 .../batch_fc_grad_kernel_register.cu | 0
 .../batch_norm_grad_kernel_register.cu | 2 +-
 .../batch_norm_kernel_register.cu | 0
 .../bilinear_grad_kernel_register.cu | 0
 .../bilinear_kernel_register.cu | 0
 .../metax_kernel/blha_get_max_len_register.cu | 2 +-
 .../bmm_grad_kernel_register.cu | 0
 .../bmm_kernel_register.cu | 0
 ...abel_cross_entropy_grad_kernel_register.cu | 0
 .../cholesky_grad_kernel_register.cu | 0
 .../metax_kernel/cholesky_kernel_register.cu | 2 +-
 .../conv_kernel_register.cu | 0
 .../conv_transpose_kernel_register.cu | 0
 .../crop_kernel_register.cu | 0
 .../cross_entropy_kernel_register.cu | 2 +-
 .../depthwise_conv_grad_kernel.cu | 0
 .../depthwise_conv_kernel.cu | 0
 .../kernels/{ => metax_kernel}/elementwise.h | 0
 .../{ => metax_kernel}/flags_declare.cu | 0
 .../flash_attn_grad_kernel.cu | 0
 .../{ => metax_kernel}/flash_attn_kernel.cu | 0
 .../{ => metax_kernel}/flash_attn_kernel.h | 0
 .../{ => metax_kernel}/flash_attn_utils.h | 0
 .../kernels/{ => metax_kernel}/flashattn.cc | 0
 .../kernels/{ => metax_kernel}/flashattn.h | 0
 .../flatten2_grad_kernel_register.cu | 0
 .../flatten2_kernel_register.cu | 0
 .../fused_conv2d_add_act_kernel_register.cu | 3 +-
 .../fused_rope_grad_kernel_register.cu | 0
 .../fused_rope_kernel_register.cu | 0
 .../instance_norm_grad_kerne_registerl.cu | 2 +-
 .../instance_norm_kernel_register.cu | 2 +-
 .../layer_norm_grad_kernel_register.cu | 0
 .../layer_norm_kernel_register.cu | 0
 .../lstm_kernel_register.cu | 0
 .../metax_kernel/lu_kernel_register.cu | 2 +-
 .../lu_solve_grad_kernel_register.cu | 0
 .../metax_kernel/matrix_rank_tol_kernel.cu | 2 +-
 .../{ => metax_kernel}/metax_context.cc | 24 +-
 .../{ => metax_kernel}/metax_context.h | 6 +-
 .../multi_dot_grad_kernel_register.cu | 0
 .../multi_dot_kernel_register.cu | 0
 .../mv_grad_kernel_register.cu | 0
 .../mv_kernel_register.cu | 0
 .../metax_kernel/qr_kernel_register.cu | 2 +-
 .../rank_attention_grad_kernel_register.cu | 0
 .../rank_attention_kernel_register.cu | 0
 .../metax_kernel/rnn_grad_kernel.cu.cc | 2 +-
 .../kernels/metax_kernel/rnn_kernel.cu.cc | 2 +-
 .../slogdeterminant_kernel_register.cu | 0
 .../softmax_kernel_grad_register.cu | 0
 .../softmax_kernel_register.cu | 0
 .../solve_grad_kernel_register.cu | 0
 .../standard_gamma_kernel_register.cu | 0
 .../stft_kernel_register.cu | 0
 .../svd_kernel_register.cu | 0
 .../top_k_grad_kernel_register.cu | 0
 .../triangular_solve_grad_kernel_register.cu | 0
 .../triangular_solve_kernel_register.cu | 0
 .../warprnnt_kernel_register.cu | 0
 .../weight_only_linear_kernel.cu | 0
 .../weight_quantize_kernel_register.cu | 0
 backends/metax_gpu/patch/paddle.patch | 204 +++++++++---------
 backends/metax_gpu/tests/CMakeLists.txt | 54 ++---
 74 files changed, 166 insertions(+), 163 deletions(-)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/addmm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_fc_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/batch_norm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bilinear_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/bmm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/cholesky_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/conv_transpose_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/crop_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_grad_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/depthwise_conv_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/elementwise.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flags_declare.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_grad_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_kernel.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flash_attn_utils.h (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.cc (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/flashattn.h (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/flatten2_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_conv2d_add_act_kernel_register.cu (99%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/fused_rope_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/layer_norm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lstm_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/lu_solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.cc (90%)
 rename backends/metax_gpu/kernels/{ => metax_kernel}/metax_context.h (96%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/multi_dot_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/mv_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/rank_attention_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/slogdeterminant_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_grad_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/softmax_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/standard_gamma_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/stft_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/svd_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/top_k_grad_kernel_register.cu (100%)
 mode change 100755 => 100644
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_grad_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/triangular_solve_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/warprnnt_kernel_register.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_only_linear_kernel.cu (100%)
 rename backends/metax_gpu/kernels/{cuda_kernels => metax_kernel}/weight_quantize_kernel_register.cu (100%)

diff --git a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
index 11def2c9ee4..2aa8424f0b1 100644
--- a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
+++ b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "kernels/flash_attn_utils.h"
+#include "kernels/metax_kernel/flash_attn_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/calc_reduced_attn_kernel.h"

diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu
index d738a53f43a..44bfd02a308 100644
--- a/backends/metax_gpu/kernels/funcs/softmax.cu
+++ b/backends/metax_gpu/kernels/funcs/softmax.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

diff --git a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
index ec429950872..8c5996e680b 100644
--- a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
+++ b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h
@@ -24,7 +24,7 @@
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/common/errors.h"
 #endif
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
index da61a1e5b41..a0f89047045 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
+++ b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "glog/logging.h"
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h"
 #include "paddle/phi/kernels/autotune/switch_autotune.h"
 #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
index 0067818d165..b7eebfcee2e 100644
--- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include
 
 #include "kernels/gpudnn/conv_cudnn_v7.h"
-#include "kernels/metax_context.h"
+#include "kernels/metax_kernel/metax_context.h"
 #include "paddle/common/ddim.h"
 #include "paddle/phi/backends/context_pool.h"
 #include "paddle/phi/backends/dynload/cudnn.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
index c115f5ad930..1c2bfeedf34 100644
--- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
+++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gpudnn/pool_gpudnn.h"
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/full_kernel.h"

diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
index 168752700e9..5844886ad1b 100644
--- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
+++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h
@@ -25,7 +25,7 @@
 #include "paddle/phi/kernels/primitive/kernel_primitives.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "metax_context.h"  //NOLINT
+#include "kernels/metax_kernel/metax_context.h"  //NOLINT
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 

diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
index 70af87513e5..c2e2e341bf5 100644
--- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
+++ b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h
@@ -17,7 +17,7 @@
 #include
 #include
 
-#include "kernels/elementwise.h"
+#include "kernels/metax_kernel/elementwise.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/amp_type_traits.h"

diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu
diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu
diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu
similarity index 100%
rename from backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu
rename to backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu
diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
index 062646bbf9d..52fe5a1d566 100644
--- a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
+++ b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu
@@ -13,7 +13,7 @@
 // limitations under the License.
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/flags.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu index bc9eb23c0e8..42810569fde 100644 --- a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" #include "kernels/metax_kernel/block_attn.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu 
b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu index e8fae2d9da5..8a39ae3f0a8 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu index e94862ec7b0..043a64dc149 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/kernels/cross_entropy_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu diff --git a/backends/metax_gpu/kernels/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h similarity index 100% rename from backends/metax_gpu/kernels/elementwise.h rename to backends/metax_gpu/kernels/metax_kernel/elementwise.h diff --git a/backends/metax_gpu/kernels/flags_declare.cu b/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu similarity index 100% rename from backends/metax_gpu/kernels/flags_declare.cu rename to backends/metax_gpu/kernels/metax_kernel/flags_declare.cu diff --git a/backends/metax_gpu/kernels/flash_attn_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h diff --git a/backends/metax_gpu/kernels/flash_attn_utils.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_utils.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h diff --git a/backends/metax_gpu/kernels/flashattn.cc b/backends/metax_gpu/kernels/metax_kernel/flashattn.cc similarity index 100% rename from backends/metax_gpu/kernels/flashattn.cc rename to backends/metax_gpu/kernels/metax_kernel/flashattn.cc diff --git a/backends/metax_gpu/kernels/flashattn.h b/backends/metax_gpu/kernels/metax_kernel/flashattn.h similarity index 100% rename from backends/metax_gpu/kernels/flashattn.h rename to backends/metax_gpu/kernels/metax_kernel/flashattn.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu similarity index 99% rename from backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu index 48809ceefa4..c0d15b7f1b4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,8 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); + desc->set( + dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu index d7540d949a9..bdf341f5a35 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu index db975d74665..e0c0ae9c1d6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu index 5a2d85418a1..72e4c5b2b79 100644 --- a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu @@ -18,7 +18,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu index bda5dc62f1a..d8c3355e6e4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu @@ -18,7 +18,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/type_traits.h" diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc similarity index 90% rename from backends/metax_gpu/kernels/metax_context.cc rename to backends/metax_gpu/kernels/metax_kernel/metax_context.cc index f0c92f00565..62aaa5fb2de 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -12,27 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" namespace phi { const bool allow_tf32_cublas = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUBLAS"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return false; }(); const bool allow_tf32_cudnn = []() -> bool { - const char* v = std::getenv("ALLOW_TF32_CUDNN"); - if (v) { - return std::atoi(v); - } - return false; + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; }(); bool AllowTF32Cublas() { return allow_tf32_cublas; } -bool AllowTF32Cudnn() { return allow_tf32_cudnn; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h similarity index 96% rename from backends/metax_gpu/kernels/metax_context.h rename to backends/metax_gpu/kernels/metax_kernel/metax_context.h index 683a6df7017..a6610c1dab2 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ -#define BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ +#define BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ #include #include #include @@ -161,4 +161,4 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, return DnnWorkspaceHandle(alloactor, stream); } } // namespace phi -#endif // BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#endif // BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 745069e2eda..c3041254444 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,7 +22,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc index 499832049e4..101b51aa350 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/rnn_grad_kernel.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index f1cf9e09dc7..2598ce093e6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/rnn_kernel.h" #include "glog/logging.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu similarity index 
100% rename from backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu old mode 100755 new mode 100644 similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 0283a443adb..e56826c4f3e 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -16,16 +16,16 @@ index 
@@ -16,16 +16,16 @@ index cfada544d4..a690e97d74 100644
-     set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header})
+     # set(EIGEN_PATCH_COMMAND ${EIGEN_PATCH_COMMAND} && git apply ${complex_header})
   endif()
-
+
   set(EIGEN_INCLUDE_DIR ${SOURCE_DIR})
 diff --git a/paddle/fluid/platform/profiler/cupti_data_process.cc b/paddle/fluid/platform/profiler/cupti_data_process.cc
 index bff0f2bf70..9376b5781f 100644
 --- a/paddle/fluid/platform/profiler/cupti_data_process.cc
 +++ b/paddle/fluid/platform/profiler/cupti_data_process.cc
 @@ -16,7 +16,7 @@
-
+
  #include
-
+
 -#include "paddle/fluid/platform/enforce.h"
 +// #include "paddle/fluid/platform/enforce.h"
  #include "paddle/phi/core/os_info.h"
@@ -76,7 +76,7 @@ index c0080f0a5e..458ca3e2e8 100644
 +  __macro(cudnnDestroyActivationDescriptor); \
 +  __macro(cudnnSetRNNDescriptor_v6);
  CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
-
+
  #if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000
 @@ -152,7 +161,12 @@ CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
  #define CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(__macro) \
@@ -91,11 +91,11 @@ index c0080f0a5e..458ca3e2e8 100644
 +  __macro(cudnnRNNForwardInferenceEx);
  CUDNN_DNN_ROUTINE_EACH_AFTER_TWO_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
  #endif
-
+
 @@ -195,40 +209,6 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
  CUDNN_DNN_ROUTINE_EACH_FRONTEND(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
  #endif
-
+
 -#if CUDNN_VERSION < 90000
 -#define CUDNN_DNN_ROUTINE_EACH_REMOVED_IN_E9(__macro) \
 -  __macro(cudnnGetRNNParamsSize);                     \
-
@@ -132,15 +132,15 @@ index c0080f0a5e..458ca3e2e8 100644
 -#endif
  }  // namespace dynload
  }  // namespace phi
 
 diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h
 index 1547909d92..66b2779392 100644
 --- a/paddle/phi/backends/dynload/cufft.h
 +++ b/paddle/phi/backends/dynload/cufft.h
 @@ -1,3 +1,4 @@
-+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
  /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 
  Licensed under the Apache License, Version 2.0 (the "License");
 @@ -40,7 +41,9 @@ extern void EnforceCUFFTLoaded(const char* fn_name);
       cufft_dso_handle = phi::dynload::GetCUFFTDsoHandle(); \
 diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h
 index 59e92955c9..d2f8c2da15 100644
 --- a/paddle/phi/backends/dynload/cupti.h
 +++ b/paddle/phi/backends/dynload/cupti.h
 @@ -24,8 +24,8 @@ limitations under the License. */
  #include "paddle/phi/backends/dynload/dynamic_loader.h"
  #include "paddle/phi/common/port.h"
 
 -namespace phi {
 -namespace dynload {
 +// namespace phi {
 +// namespace dynload {
 
  extern std::once_flag cupti_dso_flag;
  extern void *cupti_dso_handle;
 @@ -71,7 +71,7 @@ extern void *cupti_dso_handle;
  CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP);
 
  #undef DECLARE_DYNAMIC_LOAD_CUPTI_WRAP
 -}  // namespace dynload
 -}  // namespace phi
 +// }  // namespace dynload
 +// }  // namespace phi
 
-#endif  // PADDLE_WITH_CUPTI
+#endif  // PADDLE_WITH_CUPTI
\ No newline at end of file
@@ -230,28 +230,28 @@ index 4ff2e528a9..81421c8ca1 100644
 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h
 +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h
 @@ -1,3 +1,4 @@
-+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved.
  /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
+
  Licensed under the Apache License, Version 2.0 (the "License");
 @@ -25,7 +26,7 @@ namespace phi {
  namespace backends {
  namespace gpu {
 
 -#define FULL_WARP_MASK 0xFFFFFFFF
 +#define FULL_WARP_MASK 0xFFFFFFFFFFFFFFFFULL
  #define CREATE_SHFL_MASK(mask, predicate) \
    mask = __ballot_sync(FULL_WARP_MASK, (predicate))
 
 @@ -45,12 +46,12 @@ namespace gpu {
 
  template
  __forceinline__ __device__ T
 -CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) {
 +CudaShuffleDownSync(unsigned long long mask, T val, int delta, int width = warpSize) {
    return __shfl_down_sync(mask, val, static_cast(delta), width);
  }
 
  template
 -__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask,
 +__forceinline__ __device__ T CudaShuffleXorSync(unsigned long long mask,
@@ -259,7 +259,7 @@ index 4ff2e528a9..81421c8ca1 100644
    int width = warpSize) {
    return __shfl_xor_sync(mask, val, width);
 @@ -58,14 +59,14 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask,
 
  template <>
  __forceinline__ __device__ phi::dtype::float16 CudaShuffleDownSync(
 -    unsigned mask, phi::dtype::float16 val, int delta, int width) {
 +    unsigned long long mask, phi::dtype::float16 val, int delta, int width) {
    return phi::dtype::float16(__shfl_down_sync(
        mask, val.to_half(), static_cast(delta), width));
  }
 
  template <>
  __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync(
 -    unsigned mask, phi::dtype::bfloat16 val, int delta, int width) {
 +    unsigned long long mask, phi::dtype::bfloat16 val, int delta, int width) {
@@ -276,7 +276,7 @@ index 4ff2e528a9..81421c8ca1 100644
    return phi::dtype::bfloat16(__shfl_down_sync(
        mask, val.to_nv_bfloat16(), static_cast(delta), width));
 @@ -77,7 +78,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleDownSync(
 
  template <>
  __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
 -    unsigned mask, phi::dtype::complex val, int delta, int width) {
 +    unsigned long long mask, phi::dtype::complex val, int delta, int width) {
@@ -285,7 +285,7 @@ index 4ff2e528a9..81421c8ca1 100644
    float real = static_cast(__shfl_down_sync(
        mask, static_cast(val.real), static_cast(delta), width));
    float imag = static_cast(__shfl_down_sync(
 @@ -87,7 +88,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
 
  template <>
  __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
 -    unsigned mask, phi::dtype::complex val, int delta, int width) {
 +    unsigned long long mask, phi::dtype::complex val, int delta, int width) {
@@ -294,14 +294,14 @@ index 4ff2e528a9..81421c8ca1 100644
        static_cast(__shfl_down_sync(mask,
                                      static_cast(val.real),
 @@ -103,13 +104,13 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleDownSync(
 
  template <>
  __forceinline__ __device__ phi::dtype::float16 CudaShuffleXorSync(
 -    unsigned mask, phi::dtype::float16 val, int width) {
 +    unsigned long long mask, phi::dtype::float16 val, int width) {
    return phi::dtype::float16(__shfl_xor_sync(mask, val.to_half(), width));
  }
 
  template <>
  __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync(
 -    unsigned mask, phi::dtype::bfloat16 val, int width) {
 +    unsigned long long mask, phi::dtype::bfloat16 val, int width) {
@@ -310,7 +310,7 @@ index 4ff2e528a9..81421c8ca1 100644
    return phi::dtype::bfloat16(
        __shfl_xor_sync(mask, val.to_nv_bfloat16(), width));
 @@ -121,7 +122,7 @@ __forceinline__ __device__ phi::dtype::bfloat16 CudaShuffleXorSync(
 
  template <>
  __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
 -    unsigned mask, phi::dtype::complex val, int width) {
 +    unsigned long long mask, phi::dtype::complex val, int width) {
@@ -319,7 +319,7 @@ index 4ff2e528a9..81421c8ca1 100644
    float real = static_cast(
        __shfl_xor_sync(mask, static_cast(val.real), width));
    float imag = static_cast(
 @@ -131,7 +132,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
 
  template <>
  __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
 -    unsigned mask, phi::dtype::complex val, int width) {
 +    unsigned long long mask, phi::dtype::complex val, int width) {
@@ -328,14 +328,14 @@ index 4ff2e528a9..81421c8ca1 100644
        __shfl_xor_sync(mask, static_cast(val.real), width));
    double imag = static_cast(
 @@ -141,7 +142,7 @@ __forceinline__ __device__ phi::dtype::complex CudaShuffleXorSync(
 
  template
  __forceinline__ __device__ T
 -CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) {
 +CudaShuffleSync(unsigned long long mask, T val, int src_line, int width = 32) {
    return __shfl_sync(mask, val, src_line, width);
  }
 
 @@ -160,7 +161,7 @@ __device__ T reduceSum(T val, int tid, int len) {
    // but most card's warp size is 32.
    const int warpSize = 32;
@@ -343,7 +343,7 @@ index 4ff2e528a9..81421c8ca1 100644
 -  unsigned mask = 0u;
 +  unsigned long long mask = 0ull;
    CREATE_SHFL_MASK(mask, tid < len);
 
    for (int offset = warpSize / 2; offset > 0; offset /= 2)
 diff --git a/paddle/phi/core/enforce.h b/paddle/phi/core/enforce.h
 index 024a7de73e..1e4cdf16be 100644
 --- a/paddle/phi/core/enforce.h
 +++ b/paddle/phi/core/enforce.h
 @@ -45,7 +45,9 @@ limitations under the License. */
  #endif
 
  #ifdef PADDLE_WITH_CUDA
 -#include "paddle/phi/backends/dynload/cublas.h"
 +// #include "paddle/phi/backends/dynload/../../../../../cublas.h"
  #include "paddle/phi/backends/dynload/curand.h"
  #include "paddle/phi/backends/dynload/cusolver.h"
 @@ -97,7 +99,7 @@ inline bool is_error(bool stat) { return !stat; }
 
  void ThrowWarnInternal(const std::string& message);
 
 -#if defined(__CUDA_ARCH__)
 +#if defined(__CUDACC__)
  // For cuda, the assertions can affect performance and it is therefore
  } while (0)
  #elif defined(__HIPCC__)
 @@ -757,4 +759,4 @@ inline void retry_sleep(unsigned millisecond) {
 
  }  // namespace enforce
  using namespace enforce;  // NOLINT
 -}  // namespace phi
 diff --git a/paddle/phi/core/platform/device_context.h? (gpu_types.h)
 index c646e487d0..325122175c 100644
 @@ -25,8 +25,9 @@
  #else
  #include
 
 -#include "paddle/phi/backends/dynload/cublas.h"
 -#include "paddle/phi/backends/dynload/cublasLt.h"
 +// #include "paddle/phi/backends/dynload/cublas.h"
 +// #include "paddle/phi/backends/dynload/cublasLt.h"
  #include "paddle/phi/backends/dynload/cudnn.h"
  #endif
 
 @@ -90,7 +91,7 @@ DECLARE_TYPE_FOR_GPU(gpuStreamCaptureMode,
 
  // TODO(Ming Huang): Since there is no blasLt handler,
  // use rocblas_handle for workaround.
 -DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle);
 +// DECLARE_TYPE_FOR_GPU(blasLtHandle_t, cublasLtHandle_t, rocblas_handle);
 
  #undef DECLARE_TYPE_FOR_GPU
 
 diff --git a/paddle/phi/core/platform/device_context.h b/paddle/phi/core/platform/device_context.h
 index 2d02eb370b..8a7233e34e 100644
 --- a/paddle/phi/core/platform/device_context.h
 +++ b/paddle/phi/core/platform/device_context.h
 diff --git a/paddle/phi/kernels/cpu/index_select_impl.h b/paddle/phi/kernels/cpu/index_select_impl.h
 index d69eb67d6f..1d8b6e9375 100644
 --- a/paddle/phi/kernels/cpu/index_select_impl.h
 +++ b/paddle/phi/kernels/cpu/index_select_impl.h
 @@ -18,7 +18,7 @@
 
  #include "paddle/phi/core/dense_tensor.h"
  #include "paddle/phi/core/tensor_utils.h"
 -#include "paddle/phi/kernels/funcs/blas/blas.h"
 +#include "kernels/funcs/blas/blas.h"
  #include "paddle/phi/kernels/funcs/eigen/common.h"
  #include "paddle/phi/kernels/funcs/math_function.h"
 
 diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
 index cb35feee32..64f5bd24ac 100644
 --- a/paddle/phi/kernels/funcs/fc_functor.cu
 +++ b/paddle/phi/kernels/funcs/fc_functor.cu
 @@ -16,12 +16,12 @@ limitations under the License. */
 
  #include "paddle/phi/backends/all_context.h"
  #include "paddle/phi/kernels/funcs/aligned_vector.h"
 -#include "paddle/phi/kernels/funcs/blas/blas.h"
 +#include "kernels/funcs/blas/blas.h"
  #include "paddle/phi/kernels/funcs/fc_functor.h"
 
  #include "paddle/phi/backends/gpu/gpu_launch_config.h"
  #include "paddle/phi/core/dense_tensor.h"
 -#include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h"
 +// #include "paddle/phi/kernels/funcs/blas/blaslt_impl.cu.h"
  #include "paddle/phi/kernels/funcs/quant_dequant.h"
  #include "paddle/phi/kernels/matmul_kernel.h"
 
 diff --git a/paddle/phi/kernels/funcs/gru_compute.cu b/paddle/phi/kernels/funcs/gru_compute.cu
 index 88663ec880..98b93072a3 100644
 --- a/paddle/phi/kernels/funcs/gru_compute.cu
 +++ b/paddle/phi/kernels/funcs/gru_compute.cu
 @@ -12,7 +12,7 @@ limitations under the License. */
  #include "paddle/phi/kernels/funcs/gru_compute.h"
 
  #include "paddle/phi/backends/gpu/gpu_context.h"
 -#include "paddle/phi/kernels/funcs/blas/blas.h"
 +#include "kernels/funcs/blas/blas.h"
  #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h"
  #include "paddle/phi/kernels/funcs/detail/gru_kernel.h"
 
 diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h
 index 15e1a4a3c3..e4780538d7 100644
 --- a/paddle/phi/kernels/funcs/math/context_project.h
 +++ b/paddle/phi/kernels/funcs/math/context_project.h
 @@ -18,7 +18,7 @@
  #include
 
  #include "paddle/phi/core/tensor_utils.h"
 -#include "paddle/phi/kernels/funcs/blas/blas.h"
 +#include "kernels/funcs/blas/blas.h"
  #include "paddle/phi/kernels/funcs/im2col.h"
 
  namespace phi {
 diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu
 index e101224970..a52eb6096f 100644
 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu
 +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu
 @@ -15,11 +15,13 @@ limitations under the License. */
  #include "paddle/phi/kernels/funcs/matrix_inverse.h"
 
  #include "paddle/phi/common/memory_utils.h"
 -#include "paddle/phi/kernels/funcs/blas/blas.h"
 +#include "kernels/funcs/blas/blas.h"
 
  namespace phi {
  namespace funcs {
 
 +
 +
  template
 index 558d363b39..05da04b517 100644
 +#include "kernels/funcs/blas/blas.h"
  #include "paddle/phi/kernels/funcs/math_function.h"
  #include "paddle/phi/kernels/funcs/scatter.cu.h"
 
 diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu
 index 8b0baf5f5f..260482f124 100644
 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu
 +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu
 @@ -27,7 +27,7 @@ namespace cub = hipcub;
 
  #include "paddle/phi/kernels/funcs/multihead_matmul_functor.h"
 
 -#include "paddle/phi/kernels/funcs/blas/blas.h"
 +#include "kernels/funcs/blas/blas.h"
  #include "paddle/phi/kernels/funcs/math_cuda_utils.h"
 
  namespace phi {
 diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h
 index e30d440ff3..3c74792690 100644
 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h
 +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h
 @@ -30,11 +30,11 @@ limitations under the License. */
  #include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
  #include "paddle/phi/kernels/primitive/functor_primitives.h"
 
 -#define FINAL_MASK 0xffffffff
 +#define FINAL_MASK 0xffffffffffffffffull
  #ifdef PADDLE_WITH_HIP
 -#define WARP_SIZE 64
 +#define WARP_SIZE 64
  #endif
  #define MAX_NUM_THREADS 1024
 
 @@ -196,21 +196,56 @@ __device__ __forceinline__ void AddTo(Pair topk[],
    for (int k = beam_size - 2; k >= 0; k--) {
      if (largest) {
 +      topk[0 + offset].v = p.v;
 +      topk[0 + offset].id = p.id;
  }
 
  template
 @@ -239,24 +274,24 @@ __device__ __forceinline__ void GetTopK(Pair topk[],
  template
 +      // topk + MaxLength - *beam, src, tid, dim, *max, length, largest);
    }
  }
 
 @@ -355,6 +394,8 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[],
        shared_max[wid] = input_now;
    }
 -      if (--(*k) == 0) break;
 +      // if (--(*k) == 0) break;
 +      unsigned long long mask = 0ull;
 
 -      unsigned mask = 0u;
 +      // unsigned mask = 0u;
        CREATE_SHFL_MASK(mask, true);
 +    return ret;
  }
 
  static __device__ __forceinline__ unsigned int SetBitfield(
      unsigned int val, unsigned int to_insert, int pos, int len) {
    unsigned int ret;
 -    asm("bfi.b32 %0, %1, %2, %3, %4;"
 -        : "=r"(ret)
 -        : "r"(to_insert), "r"(val), "r"(pos), "r"(len));
 +
 +    ret = (static_cast(val) << (32 - pos - len)) >> (32 - len);
    return ret;
  }
      int len) {
    uint64_t ret;
 -    asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len));
 +
 +
 +    ret = (static_cast(val) << (64 - pos - len)) >> (64 - len);
    return ret;
  }
 
 @@ -507,9 +556,9 @@ struct Bitfield {
                                                   int pos,
                                                   int len) {
      uint64_t ret;
 -    asm("bfi.b64 %0, %1, %2, %3, %4;"
 -        : "=l"(ret)
 -        : "l"(to_insert), "l"(val), "r"(pos), "r"(len));
 +
 +    ret = (static_cast(val) << (64 - pos - len)) >> (64 - len);
 +    return ret;
    int lane_id;
 -  asm("mov.s32 %0, %%laneid;" : "=r"(lane_id));
 -  return lane_id;
 +
 +//   // >>>> PTX2CPP Success <<<<
 +//   {
 +//     (lane_id)=(threadIdx.x&(warpSize-1));
 +//   }
 +  return ::__lane_id();
 +  // return lane_id;
  }
 
  __device__ __forceinline__ unsigned GetLaneMaskLe() {
    unsigned mask;
 -  asm("mov.u32 %0, %%lanemask_le;" : "=r"(mask));
 -  return mask;
 +  return ((uint64_t(1) << ::__lane_id()) << 1) - 1;
 +  // return mask;
  }
 
  template
 @@ -881,7 +936,8 @@ __global__ void GatherKthValue(const T* input,
 
    // 1. Find the k-th value
    T kth_value = static_cast(0);
 -  RadixSearch::RadixType, IndexType, false>(
 +  // RadixSearch::RadixType, IndexType, false>(
 +  RadixSearch::RadixType, IndexType, false>(
        cur_input, k, num_cols, shared_mem, &kth_value);
 
    __shared__ int64_t block_min_idx;
 @@ -1314,3 +1370,4 @@ bool SortTopk(const phi::GPUContext& dev_ctx,
  }
 -}  // namespace phi
 \ No newline at end of file
 +}  // namespace phi
 diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h
 index 32db61532f..0220316bc3 100644
 --- a/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h
 +++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h
 @@ -15,7 +15,7 @@
  #pragma once
 
  #if defined(PADDLE_WITH_CUDA)
 -#include "paddle/phi/backends/dynload/cublasLt.h"
 +// #include "paddle/phi/backends/dynload/cublasLt.h"
  #endif
 
  #include "glog/logging.h"
 diff --git a/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h b/paddle/phi/kernels/fusion/gpu/fused_layernorm_residual_dropout_bias.h
 index 9d4bb18d55..ea42cc10a9 100644
 diff --git a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
 index b8cfdbf3ce..fa14b94a77 100644
 --- a/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
 +++ b/paddle/phi/kernels/fusion/gpu/masked_multihead_attention_kernel.cu
 @@ -14,7 +14,7 @@
 
  #include "paddle/phi/core/kernel_registry.h"
  #include "paddle/phi/kernels/funcs/aligned_vector.h"
 -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h"
 +#include "kernels/metax_kernel/mmha_util.cu.h"
 
  namespace phi {
  namespace fusion {
 diff --git a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu
 index e838778952..83e805e75a 100644
 --- a/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu
 +++ b/paddle/phi/kernels/fusion/gpu/qkv_unpack_mha_kernel.cu
 @@ -14,7 +14,7 @@
 
  #include "paddle/phi/core/kernel_registry.h"
  #include "paddle/phi/kernels/funcs/aligned_vector.h"
 -#include "paddle/phi/kernels/fusion/gpu/mmha_util.cu.h"
 +#include "kernels/metax_kernel/mmha_util.cu.h"
 
  namespace phi {
  namespace fusion {
 diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h
 index f0cca0f701..02ea957240 100644
 -#include "paddle/phi/kernels/impl/conv_cudnn_impl.h"
 +#include "kernels/gpudnn/conv_gpudnn.h"
 +#include "kernels/impl/conv_cudnn_impl.h"
 
  namespace phi {
  // To determine use cudnn or not.
diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h @@ -890,7 +890,7 @@ index 29fa252e96..4ae72b0935 100644 +// #endif return tanhf(x); } - + diff --git a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu index 11efd87965..679db14c24 100644 --- a/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu @@ -901,9 +901,9 @@ index 11efd87965..679db14c24 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/log_softmax_kernel.cu b/paddle/phi/kernels/gpu/log_softmax_kernel.cu index 63c35dd4ee..15da9aea45 100644 --- a/paddle/phi/kernels/gpu/log_softmax_kernel.cu @@ -914,9 +914,9 @@ index 63c35dd4ee..15da9aea45 100644 #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" +#include "kernels/gpudnn/softmax_gpudnn.h" - + namespace phi { - + diff --git a/paddle/phi/kernels/gpu/lstsq_kernel.cu b/paddle/phi/kernels/gpu/lstsq_kernel.cu index 1bdbe1564c..f753b54bc6 100644 --- a/paddle/phi/kernels/gpu/lstsq_kernel.cu @@ -948,7 +948,7 @@ index cf80666b4e..ca76e055fb 100644 --- a/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_grad_kernel_impl.h @@ -19,7 +19,7 @@ limitations under the License. */ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_grad_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" @@ -961,14 +961,14 @@ index 2789cb59a2..b91b076f7f 100644 --- a/paddle/phi/kernels/impl/baddbmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/baddbmm_kernel_impl.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ - + #include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/baddbmm_kernel.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" - + diff --git a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h b/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h index 9a21c23666..86413d1577 100644 --- a/paddle/phi/kernels/impl/conv_transpose_grad_kernel_impl.h @@ -993,7 +993,7 @@ index 4459a931da..837c8682b8 100644 -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h index ad9e9197dd..5478d9817d 100644 @@ -1013,27 +1013,27 @@ index e6b3960f6d..564125f1f6 100644 --- a/paddle/phi/kernels/impl/gammaincc_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaincc_kernel_impl.h @@ -56,8 +56,8 @@ HOSTDEVICE T igam(const T a, const T x) { - + template HOSTDEVICE T igamc(const T a, const T x) { - static T big = 4.503599627370496e15; - static T biginv = 2.22044604925031308085e-16; + const static T big = 4.503599627370496e15; + const static T biginv = 2.22044604925031308085e-16; - + if ((x <= T{0}) || (a <= T{0})) return (T{1.0}); - + diff --git a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h index 410fb3c560..009ce03440 100644 --- a/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/gammaln_grad_kernel_impl.h @@ -54,7 +54,7 @@ HOSTDEVICE T digamma_positive_domain(T x) { - + template HOSTDEVICE T digamma(T x) { - static T pi = T{3.14159265358979323846}; + const static T pi = T{3.14159265358979323846}; - + if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -1048,12 +1048,12 @@ index 5ebbc8d2db..48acf8d0cd 100644 -#include "paddle/phi/kernels/funcs/quant_dequant.h" +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_context.h" - ++#include "kernels/metax_kernel/metax_context.h" + #pragma once - + @@ -668,7 +669,7 @@ void LLMGemm(const phi::GPUContext& dev_ctx, - + { auto helper = - std::make_unique(m, k, n, dev_ctx.cublaslt_handle()); @@ -1067,12 +1067,12 @@ index 1f319c4ae3..9186eb6906 100644 +++ b/paddle/phi/kernels/impl/matrix_power_grad_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once - + #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" - + namespace phi { diff --git a/paddle/phi/kernels/impl/matrix_power_kernel_impl.h b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h index 6f03f76eeb..5fe2c3e7dc 100644 @@ -1080,13 +1080,13 @@ index 6f03f76eeb..5fe2c3e7dc 100644 +++ b/paddle/phi/kernels/impl/matrix_power_kernel_impl.h @@ -15,7 +15,7 @@ limitations under the License. 
*/
 #pragma once
- 
+ 
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/matrix_inverse.h"
- 
+ 
diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h
index 7b85903776..3f4b298807 100644
--- a/paddle/phi/kernels/impl/merged_momentum_impl.h
+++ b/paddle/phi/kernels/impl/merged_momentum_impl.h
@@ -14,7 +14,7 @@
- 
+ 
 #pragma once
- 
+ 
-#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
- 
+ 
diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h
index 4099d8b506..baef2cd643 100644
--- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h
+++ b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h
@@ -14,7 +14,7 @@
- 
+ 
 #pragma once
- 
+ 
-#include "paddle/phi/kernels/funcs/blas/blas.h"
+#include "kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
- 
+ 
diff --git a/third_party/flagcx
index 7c469f4af9..7e6c4cc3ca 160000
--- a/third_party/flagcx
diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt
index 37475773026..410ef006514 100755
--- a/backends/metax_gpu/tests/CMakeLists.txt
+++ b/backends/metax_gpu/tests/CMakeLists.txt
@@ -87,32 +87,34 @@ list(
 list(
   REMOVE_ITEM
   PYTHON_TEST_SCRIPTS
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py # precision issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py # affected by test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py # precision issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py # core.cudnnversion
-  # adaptation issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py # core.cudnnversion adaptation issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py # core.cudnnversion adaptation issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py # adaptation issue with
-  # self._get_places()
-  # in op_test.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py # device == "gpu" adaptation issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py # same error as paddle-gpu
-  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py # same error as paddle-gpu
-  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py # core.cudnnversion adaptation issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py # same error as paddle-gpu
-  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py # same error as paddle-gpu
-  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py # paddle.device.cuda.get_device_properties
-  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py # needs check_grad with fp64
-  # precision
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py # adaptation issue with
-  # self._get_places() in op_test.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py # CUDAPinnedPlace issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py # paddle.device.cuda.get_device_properties
-  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py # CUDAPinnedPlace issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py # paddle.device.cuda.get_device_properties
-)
+  # precision issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
+  # core.cudnnversion
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
+  # adaptation issue with self._get_places() in op_test.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py
+  # device == "gpu" adaptation issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
+  # same error as paddle-gpu
+  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
+  # paddle.device.cuda.get_device_properties
+  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py
+  # needs check_grad with fp64 precision
+  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
+  # CUDAPinnedPlace issue
+  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py
+  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py)
 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)
 
 foreach(test_script ${PYTHON_TEST_SCRIPTS})

From c05f95f939c84359cc5bee3e3d88cb7235f55e83 Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Fri, 19 Sep 2025 14:57:09 +0800
Subject: [PATCH 4/5] [Metax] add log analysis script

---
 .../metax_gpu/tests/scripts/classify.json    |  22 ++
 .../metax_gpu/tests/scripts/log_analysis.py  | 216 ++++++++++++++++++
 2 files changed, 238 insertions(+)
 create mode 100644 backends/metax_gpu/tests/scripts/classify.json
 create mode 100644 backends/metax_gpu/tests/scripts/log_analysis.py

diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json
new file mode 100644
index 00000000000..b97255adc3d
--- /dev/null
+++ b/backends/metax_gpu/tests/scripts/classify.json
@@ -0,0 +1,22 @@
+{
+  "OK":{
+    "skipped":{
+      "rule":["skipped="]
+    }
+  },
+
+  "FAILED":{
+    "precision":{
+      "rule":["Mismatched elements"]
+    },
+    "api":{
+      "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"]
+    },
+    "missing":{
+      "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"]
+    },
+    "file_not_found":{
+      "rule":["FileNotFoundError:"]
+    }
+  }
+}
diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py
new file mode 100644
index 00000000000..c0716f5b6f5
--- /dev/null
+++ b/backends/metax_gpu/tests/scripts/log_analysis.py
@@ -0,0 +1,216 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import fnmatch
+import shutil
+from enum import Enum
+
+
+class TestResult(Enum):
+    OK = "OK"
+    FAILURE = "FAILED"
+
+
+class LogAnalyzer:
+    def __init__(
+        self,
+        classify_file: str,
+        search_path: str,
+        pattern: str = None,
+        encoding: str = "utf-8",
+    ):
+        self.__pattern = pattern
+        self.__search_path = search_path
+        self.__encoding = encoding
+        self.__statistical_data = {}
+
+        self.__classify_data = self.__read_json_file(classify_file)
+        for key, value in self.__classify_data.items():
+            self.__statistical_data[key] = {}
+            for sub_key in list(value.keys()):
+                self.__statistical_data[key][sub_key] = []
+
+        self.__statistical_data[TestResult.OK.value]["noskip"] = []
+        self.__statistical_data[TestResult.FAILURE.value]["other"] = []
+
+    def __read_json_file(self, path: str) -> dict:
+        with open(path, "r", encoding=self.__encoding) as f:
+            data = json.load(f)
+            # the context manager closes the file on exit
+        return data
+
+    def __check_path(self, path: str) -> None:
+        """
+        Prepare the given path:
+        - directory path: create it if missing, clear its contents if present
+        - file path: create it if missing, clear its contents if present
+        """
+        try:
+            # check whether the path exists
+            if os.path.exists(path):
+                # it exists; decide whether it is a file or a directory
+                if os.path.isfile(path):
+                    # file: truncate its contents
+                    with open(path, "w", encoding="utf-8") as f:
+                        f.write("")  # write nothing to empty the file
+                    # print(f"File already exists, contents cleared: {path}")
+
+                elif os.path.isdir(path):
+                    # directory: remove everything inside it
+                    for item in os.listdir(path):
+                        item_path = os.path.join(path, item)
+                        if os.path.isfile(item_path) or os.path.islink(item_path):
+                            os.remove(item_path)  # delete file or symlink
+                        elif os.path.isdir(item_path):
+                            shutil.rmtree(item_path)  # recursively delete subdirectory
+                    # print(f"Directory already exists, contents cleared: {path}")
+            else:
+                # the path does not exist; infer the target type
+                # from the last component of the path
+                last_part = os.path.basename(path)
+
+                # treat it as a file path if it carries an extension
+                if "." in last_part and not last_part.endswith("."):
+                    # create the file (including parent directories)
+                    parent_dir = os.path.dirname(path)
+                    if parent_dir and not os.path.exists(parent_dir):
+                        os.makedirs(parent_dir, exist_ok=True)
+                    with open(path, "w", encoding="utf-8") as f:
+                        pass  # create an empty file
+                    # print(f"File did not exist, created: {path}")
+
+                else:
+                    # create the directory (multi-level paths supported)
+                    os.makedirs(path, exist_ok=True)
+                    # print(f"Directory did not exist, created: {path}")
+
+        except PermissionError:
+            print(f"Permission error: cannot operate on path {path}")
+        except Exception as e:
+            print(f"Error while handling path: {str(e)}")
+
+    def save_result(self, dir_path: str = "./") -> None:
+        """
+        Check whether the output directory exists:
+        - create it if missing
+        - otherwise clear everything inside it (the directory itself is kept)
+        """
+
+        for key, value in self.__statistical_data.items():
+            sub_dir = os.path.join(dir_path, key)
+            self.__check_path(sub_dir)
+
+            for sub_key, sub_value in value.items():
+                # print(f"{sub_key}: {len(value[sub_key])} - ({sub_value})")
+                try:
+                    with open(
+                        os.path.join(sub_dir, sub_key) + ".txt", "w", encoding="utf-8"
+                    ) as f:
+                        for op_name in sub_value:
+                            if not op_name.endswith("\n"):
+                                op_name += "\n"
+                            f.write(op_name)
+                    # print(f"Content successfully {'appended' if append else 'written'} to {file_path}")
+                except Exception as e:
+                    print(f"Failed to write file: {e}")
+
+    def show_result(self) -> None:
+        test_counts = 0
+        for key, value in self.__statistical_data.items():
+            print(f"\n---------- {key} ----------")
+            for sub_key, sub_value in value.items():
+                test_counts = test_counts + len(value[sub_key])
+                print(f"{sub_key}: {len(value[sub_key])}\n\t{sub_value}\n")
+        print(
+            f"\n******************* Total log num: {test_counts} *******************\n\n"
+        )
+
+    def run(self):
+        """
+        Read files under the search path whose names match the pattern and scan them line by line.
+
+        Args:
+            search_path: root directory to search
+            pattern: file name pattern (wildcards supported, e.g. '*.txt', 'file_*.log')
+        """
+        for dirpath, dirnames, filenames in os.walk(self.__search_path):
+            for filename in fnmatch.filter(filenames, self.__pattern):
+                file_path = os.path.join(dirpath, filename)
+                # print(f"\n===== Processing file: {file_path} =====")
+
+                cur_res_type = TestResult.FAILURE
+                cur_sub_type = "other"
+                pre_line = None
+                finish_early = False
+
+                try:
+                    with open(file_path, "r", encoding=self.__encoding) as f:
+                        for line in f:
+                            for sub_type, sub_type_params in self.__classify_data[
+                                cur_res_type.value
+                            ].items():
+                                for keyword in sub_type_params["rule"]:
+                                    if keyword in line:
+                                        cur_sub_type = sub_type
+                                        if sub_type == "missing":
+                                            finish_early = True
+                                        break
+
+                                if finish_early:
+                                    break
+
+                            pre_line = line
+                            if finish_early:
+                                break
+
+                        if pre_line is not None and "OK" in pre_line:
+                            cur_res_type = TestResult.OK
+                            cur_sub_type = None
+                            for sub_type, sub_type_params in self.__classify_data[
+                                cur_res_type.value
+                            ].items():
+                                for rule in sub_type_params["rule"]:
+                                    if rule in pre_line:
+                                        cur_sub_type = sub_type
+
+                        op_name = filename.split(".")
+                        if cur_sub_type is None:
+                            self.__statistical_data[cur_res_type.value][
+                                "noskip"
+                            ].append(op_name[0])
+                        else:
+                            self.__statistical_data[cur_res_type.value][
+                                cur_sub_type
+                            ].append(op_name[0])
+                        # print(f"Result: {cur_res_type.value}, type: {cur_sub_type}")
+                        # the context manager closes the file on exit
+                except UnicodeDecodeError:
+                    print(f"Warning: file {file_path} is not utf-8 encoded, skipping it")
+                except Exception as e:
+                    print(f"Error while processing file {file_path}: {str(e)}")
+
+
+if __name__ == "__main__":
+
+    analyzer = LogAnalyzer(
+        classify_file="./classify.json",
+        search_path="./NPU_logs/20250918_065326",
+        pattern="test_*.log",
+    )
+
+    analyzer.run()
+    analyzer.show_result()
+    analyzer.save_result("./output")

From 465f54ac8a8ae5f89f045f87c7aaaed5088488ef Mon Sep 17 00:00:00 2001
From: "Mingkun.Zhang" <2496808993@qq.com>
Date: Mon, 22 Sep 2025 17:29:30 +0800
Subject: [PATCH 5/5] [Metax] update metax backend CI test

---
 backends/metax_gpu/tests/CMakeLists.txt | 192 +++++++++++------------
 backends/metax_gpu/tests/default.txt    |  67 +++++++++
 backends/metax_gpu/tests/run_test.sh    |  56 ++++++-
 3 files changed, 202 insertions(+), 113 deletions(-)
 create mode 100644 backends/metax_gpu/tests/default.txt

diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt
index 795a3c5b8ac..ded54233f24 100755
--- a/backends/metax_gpu/tests/CMakeLists.txt
+++ b/backends/metax_gpu/tests/CMakeLists.txt
@@ -11,117 +11,95 @@ set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test)
 
 file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py")
 
-list(
-  APPEND
-  PYTHON_TEST_SCRIPTS
-  ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_where_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_split_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_fill_constant_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_empty_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_sign_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_unbind_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_put_along_axis_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_maximum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_accuracy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_strided_slice_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_set_value_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_flatten_contiguous_range_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_top_k_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_subtract_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_greater_equal_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_top_k_v2_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_one_hot_v2_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_fill_any_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_reshape_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_bitwise_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_pad_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cast_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_zeros_like_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_shape_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_tril_triu_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_put_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_bincount_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_assign_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_squared_l2_norm_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_fused_bias_act_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_expand_v2_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_adamw_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_gather_nd_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_concat_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_nd_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_floordiv_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_mul_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_einsum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_numel_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_scale_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_full_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_scatter_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_clip_op.py)
-
-list(
-  REMOVE_ITEM
-  PYTHON_TEST_SCRIPTS
-  # precision issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
-  # core.cudnnversion
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
-  # adaptation issue with self._get_places() in op_test.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py
-  # device == "gpu" adaptation issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
-  # same error as paddle-gpu
-  ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
-  # paddle.device.cuda.get_device_properties
-  ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py
-  # needs check_grad with fp64 precision
-  ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
-  # CUDAPinnedPlace issue
-  ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py
-  ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py)
+if(NOT TEST_LIST_FILE)
+  message(
+    STATUS
+      "<TEST_LIST_FILE> is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used."
+  )
+  file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS)
+
+else()
+  if(NOT EXISTS ${TEST_LIST_FILE})
+    message(FATAL_ERROR "<TEST_LIST_FILE> does not exist, please check it again.")
+  endif()
+
+  file(STRINGS ${TEST_LIST_FILE} TEST_PROGRAMS)
+
+  if(NOT TEST_PROGRAMS)
+    message(FATAL_ERROR "<TEST_LIST_FILE> is empty.")
+  endif()
+
+  set(PYTHON_TEST_SCRIPTS "")
+endif()
+
+foreach(test_name ${TEST_PROGRAMS})
+  set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py)
+  if(NOT EXISTS ${CURRENT_TEST_PROGRAM})
+    message(WARNING "${CURRENT_TEST_PROGRAM} does not exist, skipping it.")
+  else()
+    list(APPEND PYTHON_TEST_SCRIPTS ${CURRENT_TEST_PROGRAM})
+  endif()
+endforeach()
 
 list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS)
+
+if(NOT TEST_LIST_FILE)
+  list(
+    REMOVE_ITEM
+    PYTHON_TEST_SCRIPTS
+    # precision issue
+    ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py
+    # core.cudnnversion
+    ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py
+    # adaptation issue with self._get_places() in op_test.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py
+    # device == "gpu" adaptation issue
+    ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py
+    # same error as paddle-gpu
+    ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py
+    # paddle.device.cuda.get_device_properties
+    ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py
+    # needs check_grad with fp64 precision
+    ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py
+    # CUDAPinnedPlace issue
+    ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py
+    ${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py)
+endif()
+
+if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR})
+  file(MAKE_DIRECTORY ${LOG_OUTPUT_DIR})
+  message(WARNING "${LOG_OUTPUT_DIR} does not exist, creating it now.")
+endif()
+
 foreach(test_script ${PYTHON_TEST_SCRIPTS})
   get_filename_component(test_name ${test_script} NAME_WE)
-  add_test(
-    NAME "python_${test_name}"
-    COMMAND ${Python_EXECUTABLE} ${test_script}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  if(LOG_OUTPUT_DIR)
+    set(test_log_file "${LOG_OUTPUT_DIR}/${test_name}.log")
+
+    add_test(
+      NAME "python_${test_name}"
+      COMMAND sh -c
+              "${Python_EXECUTABLE} ${test_script} > ${test_log_file} 2>&1"
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+
+  else()
+    add_test(
+      NAME "python_${test_name}"
+      COMMAND ${Python_EXECUTABLE} ${test_script}
+      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endif()
+
   set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360)
 endforeach()
diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt
new file mode 100644
index 00000000000..8e2c3bcdd7e
--- /dev/null
+++ b/backends/metax_gpu/tests/default.txt
@@ -0,0 +1,67 @@
+test_accuracy_op
+test_tril_triu_op
+test_where_op
+test_split_op
+test_fill_constant_op
+test_empty_op
+test_sign_op
+test_cast_op
+test_index_add_op
+test_unbind_op
+test_put_along_axis_op
+test_layer_norm_op
+test_maximum_op
+test_accuracy_op
+test_strided_slice_op
+test_sum_op
+test_set_value_op
+test_flatten_contiguous_range_op
+test_top_k_op
+test_subtract_op
+test_softmax_op
+test_cumsum_op
+test_greater_equal_op
+test_elementwise_div_op
+test_top_k_v2_op
+test_stack_op
+test_one_hot_v2_op
+test_fill_any_op
+test_gather_op
+test_reshape_op
+test_index_put_op
+test_bitwise_op
+test_max_op
+test_pad_op
+test_elementwise_pow_op
+test_uniform_random_op
+test_scatter_op
+test_cast_op
+test_zeros_like_op
+test_compare_op
+test_shape_op
+test_tril_triu_op
+test_slice_op
+test_elementwise_add_op
+test_index_put_op
+test_bincount_op
+test_assign_op
+test_logical_op
+test_squared_l2_norm_op
+test_mean_op
+test_fused_bias_act_op
+test_expand_v2_op
+test_adamw_op
+test_gather_nd_op
+test_concat_op
+test_scatter_nd_op
+test_elementwise_floordiv_op
+test_elementwise_mul_op
+test_transpose_op
+test_einsum_op
+test_randint_op
+test_c_embedding_op
+test_numel_op
+test_scale_op
+test_softmax_with_cross_entropy_op
+test_full_op
+test_scatter_op
diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh
index 7d1e8e072a9..b9e8ec5b5cc 100755
--- a/backends/metax_gpu/tests/run_test.sh
+++ b/backends/metax_gpu/tests/run_test.sh
@@ -2,13 +2,13 @@
 #!/bin/bash
 # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -29,10 +29,54 @@ export
 
 rm -r build
 mkdir -p build && cd build
-cmake ..
+TEST_LOG_LEVEL=0
+TEST_LIST_FILE=""
+TEST_LOG_OUTPUT_DIR=""
+TEST_PARALLEL_NUM=10
 
-cmake --build .
+while getopts "i:o:v:j:h" opt; do
+  case "$opt" in
+  i)
+    TEST_LIST_FILE="$OPTARG"
+    ;;
+  o)
+    TEST_LOG_OUTPUT_DIR="$OPTARG"
+    echo "Set log output dir [ $TEST_LOG_OUTPUT_DIR ]"
+    ;;
+  v)
+    TEST_LOG_LEVEL=$OPTARG
+    ;;
+  j)
+    TEST_PARALLEL_NUM="$OPTARG"
+    ;;
+  h)
+    echo "Usage: $0 -i <test list file> -o <log output dir> ..."
+    echo "Options:"
+    echo "  -i test program list file"
+    echo "  -o log output directory"
+    echo "  -v GLOG_v log level"
+    echo "  -j number of parallel ctest jobs"
+    echo "  -h show this help"
+    exit 0
+    ;;
+  \?)
+    echo "error: unknown option '-$OPTARG'."
+    exit 1
+    ;;
+  :)
+    echo "error: option '-$OPTARG' requires an argument."
+    exit 1
+    ;;
+  esac
+done
+
+
+export GLOG_v=$TEST_LOG_LEVEL
 
-ctest -j10 --output-on-failure
+cmake .. -DTEST_LIST_FILE=$TEST_LIST_FILE -DLOG_OUTPUT_DIR=$TEST_LOG_OUTPUT_DIR
+
+cmake --build .
+
+ctest -j$TEST_PARALLEL_NUM --output-on-failure
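
A minimal usage sketch of the flow these patches set up (the list file name and log directory below are illustrative, not part of the patches; note that log_analysis.py resolves ./classify.json and its search_path relative to the working directory, so run it from the scripts directory or adjust its __main__ block first, and pass absolute paths to run_test.sh since cmake runs inside ./build):

    cd backends/metax_gpu/tests
    # default suite, one log file per test, 8 parallel ctest jobs
    bash run_test.sh -o $(pwd)/logs/$(date +%Y%m%d_%H%M%S) -j 8
    # custom test list with GLOG_v=3
    bash run_test.sh -i $(pwd)/my_tests.txt -o $(pwd)/logs/debug_run -v 3
    # classify the collected logs into OK/FAILED buckets under ./output
    cd scripts && python log_analysis.py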