diff --git a/.github/workflows/metax_work.yaml b/.github/workflows/metax_work.yaml new file mode 100644 index 00000000000..51c0c62cef6 --- /dev/null +++ b/.github/workflows/metax_work.yaml @@ -0,0 +1,54 @@ +name: paddle metax gpu test + +on: + workflow_dispatch: + pull_request: + types: [opened, synchronize] + branches: [develop, release/**] + paths: + - "**" + - "!backends/**" + - "backends/metax_gpu/**" + +permissions: read-all + +defaults: + run: + shell: bash + +jobs: + metax-gpu-test: + runs-on: paddle-metax-runner-set + # runs-on: debug-paddle-runner-set + steps: + - name: Checkout repository + run: | + git config --global user.name "GitHub Actions" + git config --global user.email "actions@github.com" + + git clone \ + --reference-if-able /home/runner/PaddleCustomDevice \ + --depth=1 \ + --shallow-submodules \ + --jobs=8 \ + --branch ${{ github.base_ref || github.ref_name}} \ + --recurse-submodules \ + https://${{ github.actor }}:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}.git . 
+ + if [ "${{ github.event_name }}" == "pull_request" ]; then + git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/head + git checkout pull/${{ github.event.pull_request.number }}/head + + # git submodule update --init --recursive + fi + + + - name: compile + run: | + cd backends/metax_gpu + bash build.sh + + - name: run test + run: | + cd backends/metax_gpu/tests + bash run_test.sh diff --git a/Paddle b/Paddle index 2008f339e5a..a8b4de5f626 160000 --- a/Paddle +++ b/Paddle @@ -1 +1 @@ -Subproject commit 2008f339e5a6c687c7d3c5688ae892b56524c89e +Subproject commit a8b4de5f6260e598d6426f7778364d1277b2ad76 diff --git a/backends/metax_gpu/CMakeLists.txt b/backends/metax_gpu/CMakeLists.txt index 6048b59e6c1..b98f2bcc919 100755 --- a/backends/metax_gpu/CMakeLists.txt +++ b/backends/metax_gpu/CMakeLists.txt @@ -37,6 +37,8 @@ include(cblas) include(flashattn) include(cutlass) include(dgc) +include(warpctc) +include(warprnnt) set(PLUGIN_VERSION ${PADDLE_VERSION}) @@ -612,12 +614,9 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/math_function.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/log_softmax_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/backends/context_pool.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/funcs/repeat_tensor2index_tensor.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/binomial_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bernoulli_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_grad_kernel_impl.h - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/bmm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/backends/dynload/cufft.cc ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/box_coder_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -640,31 +639,12 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gather_tree_kernel.cu 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_reindex_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_act_dequant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/block_multi_head_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_weighted_swiglu_act_quant_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_upper_triangle_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/fp8_gemm/fp8_gemm_with_cublasLt/fp8_fp8_half_gemm.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/fused_conv2d_add_act_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/variable_length_memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/cutlass/gemm_epilogue_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/blha_get_max_len.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_elemwise_activation_grad_kernel.cu - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_real_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/as_complex_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_grad_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/stride/complex_kernel.cc - # 
${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/shape_kernel.cc - # ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/sparse/gpu/conv_kernel_igemm.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu + ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu # ############################################################################ - # kernels/fusion kernels/selected_rows ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/selected_rows/gpu/adamw_kernel.cu # kernels/kps ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -696,7 +676,6 @@ file( ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/cal_aux_loss_grad_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/expand_modality_expert_id_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/legacy/gpu/int_bincount_kernel.cu - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/fusion/gpu/fused_bias_act_kernel.cu) file( @@ -706,15 +685,14 @@ file( passes/*.cc kernels/*.cc kernels/*.cu + kernels/fusion/*.cc + kernels/fusion/*.cu kernels/gpudnn/*.cc kernels/gpudnn/*.cu kernels/cuda_kernels/*.cc kernels/cuda_kernels/*.cu kernels/funcs/blas/*.cc - kernels/ernie_core/*.cu - kernels/ernie_core/rms_norm_kernel_register.cu - kernels/ernie_core/top_p_sampling_kernel_register.cu - kernels/ernie_core/fused_bias_act_kernel_register.cu) + kernels/ernie_core/*.cu) set(CUSTOM_DEVICE_SRCS ${CUDA_SRCS} ${CC_SRCS} ${ERNIE_CORE_SRCS}) @@ -723,18 +701,17 @@ set_source_files_properties(${CUSTOM_DEVICE_SRCS} PROPERTIES LANGUAGE CUDA) set(CMAKE_CUCC_COMPILER "cucc") set(CMAKE_CUCC_FLAGS "-I /opt/maca/tools/cu-bridge/include/") -set_source_files_properties( - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu - PROPERTIES LANGUAGE CUDA) -add_library( - ${TARGET_NAME} SHARED - ${CUSTOM_DEVICE_SRCS} - ${PADDLE_SOURCE_DIR}/paddle/phi/kernels/gpu/rms_norm_kernel.cu) +add_library(${TARGET_NAME} SHARED ${CUSTOM_DEVICE_SRCS}) 
target_include_directories( ${TARGET_NAME} - PRIVATE ${PADDLE_SOURCE_DIR} ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/kernels - ${CUDA_INCLUDE_DIRS} ${PADDLE_SOURCE_DIR}/third_party/pybind/include + PRIVATE ${PADDLE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/kernels + ${CUDA_INCLUDE_DIRS} + ${WARPCTC_INCLUDE_DIR} + ${WARPRNNT_INCLUDE_DIR} + ${PADDLE_SOURCE_DIR}/third_party/pybind/include ${PADDLE_SOURCE_DIR}/paddle/phi/api/include/compat) target_link_libraries( @@ -747,16 +724,16 @@ target_link_libraries( protobuf external_error_proto dgc + ${WARPCTC_LIBRARIES} + ${WARPRNNT_LIBRARIES} ${PADDLE_CORE_LIB}) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmccl.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcFlashAttn.so) -target_link_libraries(${TARGET_NAME} /opt/maca/lib/libmcpti.so) include_directories(BEFORE ${PADDLE_SOURCE_DIR}) target_compile_definitions( ${TARGET_NAME} PUBLIC PADDLE_WITH_CUDA=1 PADDLE_WITH_CUSTOM_DEVICE=1 + mcblasContext=cublasContext GPUContext=CustomContext KPSContext=CustomContext STREAM_TYPE=cudaStream_t diff --git a/backends/metax_gpu/build.sh b/backends/metax_gpu/build.sh index dd0ab3aab90..9ca589a7807 100755 --- a/backends/metax_gpu/build.sh +++ b/backends/metax_gpu/build.sh @@ -20,15 +20,23 @@ set -e pip uninstall paddlepaddle -y -export http_proxy=http://10.2.192.21:1080 https_proxy=http://10.2.192.21:1080 +# init paddle +# git submodule sync --recursive && git submodule update --init --recursive + +# sleep 1000000 +# unset http_proxy https_proxy + + +# export http_proxy=https://172.17.0.1:1080 https_proxy=http://10.2.192.21:1080 +# export pip install safetensors==0.6.2 -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple some-package # install paddle + + python -m pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ -# exit 1 -# init paddle -git submodule sync --recursive && git submodule update --init --recursive +# unset http_proxy https_proxy # apply patch bash 
change_patch.sh @@ -49,8 +57,8 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON -make_maca -j8 +cmake_maca .. -DCMAKE_BUILD_TYPE=Release -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +make_maca -j60 echo "install whl" pip install dist/paddle_metax_gpu*.whl --force-reinstall diff --git a/backends/metax_gpu/change_patch.sh b/backends/metax_gpu/change_patch.sh index 833ae00f6bd..f29986a3780 100644 --- a/backends/metax_gpu/change_patch.sh +++ b/backends/metax_gpu/change_patch.sh @@ -21,7 +21,9 @@ unzip mcEigen_3.4.0_paddle_final.zip mv mcEigen_3.4.0_paddle_final eigen3 cd .. cp -r patch/eigen3/ ../../Paddle/third_party/eigen3 +rm -r patch/eigen3 cp patch/tmp/mixed_vector* ../../Paddle/paddle/phi/core cd ../../Paddle/ git apply --verbose ../backends/metax_gpu/patch/paddle.patch cd - +# cp -r patch/intrinsics.cuh ../../Paddle/third_party/warpctc/include/contrib/moderngpu/include/device/ diff --git a/backends/metax_gpu/cmake/warpctc.cmake b/backends/metax_gpu/cmake/warpctc.cmake new file mode 100644 index 00000000000..5d668032fb1 --- /dev/null +++ b/backends/metax_gpu/cmake/warpctc.cmake @@ -0,0 +1,163 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+ +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +endif() + +set(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) +set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed set(WARPCTC_REPOSITORY +# https://gitee.com/tianjianhe/warp-ctc.git) +set(WARPCTC_TAG bdc2b4550453e0ef2d3b5190f9c6103a84eff184) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warpctc) +set(WARPCTC_PATCH_COMMAND "") +set(WARPCTC_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && git apply + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPCTC_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.cuda.patch) + file(COPY ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh + DESTINATION ${SOURCE_DIR}/include/contrib/moderngpu/include/device/) + message(STATUS "Patch file path: ${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh") + message( + STATUS + "ModernGPU device path: ${SOURCE_DIR}/include/contrib/moderngpu/include/device/" + ) +endif() + +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.patch native_src) + set(WARPCTC_PATCH_COMMAND git checkout -- . 
&& git checkout ${WARPCTC_TAG} + && patch -Nd ${SOURCE_DIR} < ${native_src} &&) + set(WARPCTC_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +if(WITH_ROCM) + set(WARPCTC_PATHCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warpctc/CMakeLists.txt.rocm.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/devicetypes.cuh.patch && patch + -p1 < ${PADDLE_SOURCE_DIR}/patches/warpctc/hip.cmake.patch) +endif() + +set(WARPCTC_INCLUDE_DIR + "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPCTC_LIB_DIR + "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) + +if(WIN32) + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else() + set(WARPCTC_LIBRARIES + "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPCTC_C_FLAGS $) + set(WARPCTC_C_FLAGS_DEBUG $) + set(WARPCTC_C_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS $) + set(WARPCTC_CXX_FLAGS_RELEASE + $) + set(WARPCTC_CXX_FLAGS_DEBUG + $) +else() + set(WARPCTC_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") + set(WARPCTC_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPCTC_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + set(WARPCTC_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPCTC_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() + +set(COPY_COMMAND + ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/patch/intrinsics.cuh" + "${SOURCE_DIR}/include/contrib/moderngpu/include/device/") + +ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPCTC_PREFIX_DIR} + UPDATE_COMMAND 
"" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_COMMAND} + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${COPY_COMMAND} + COMMAND ${WARPCTC_PATHCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPCTC_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPCTC_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPCTC_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPCTC_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${WARPCTC_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPCTC_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=OFF + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCUDA_TOOLKIT_ROOT_DIR=${CUDA_TOOLKIT_ROOT_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPCTC_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPCTC_LIBRARIES}) + +message(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) +include_directories(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its + # headers. + +add_library(warpctc STATIC IMPORTED GLOBAL) +set_target_properties( + warpctc PROPERTIES IMPORTED_LOCATION ${WARPCTC_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPCTC_INCLUDE_DIR}) diff --git a/backends/metax_gpu/cmake/warprnnt.cmake b/backends/metax_gpu/cmake/warprnnt.cmake new file mode 100644 index 00000000000..a8d6683af2b --- /dev/null +++ b/backends/metax_gpu/cmake/warprnnt.cmake @@ -0,0 +1,143 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +include(ExternalProject) + +if(WITH_ROCM) + add_definitions(-DWARPRNNT_WITH_HIP) +endif() + +set(WARPRNNT_PREFIX_DIR ${THIRD_PARTY_PATH}/warprnnt) +set(WARPRNNT_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warprnnt) +set(WARPRNNT_TAG 7ea6bfe748779c245a0fcaa5dd9383826273eff2) +set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/warprnnt) +set(WARPRNNT_PATCH_COMMAND "") +set(WARPRNNT_CCBIN_OPTION "") +if(WIN32) + set(WARPCTC_PATCH_CUDA_COMMAND + ${CMAKE_COMMAND} -E copy_if_different + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch + "/") +else() + set(WARPCTC_PATCH_CUDA_COMMAND + git checkout -- . && git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.cuda.patch) +endif() +if(WITH_ROCM) + set(WARPRNNT_PATCH_ROCM_COMMAND + patch -p1 < + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.rocm.patch) +endif() +if(NOT WIN32 AND WITH_GPU) + if(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0 AND ${CMAKE_CXX_COMPILER_VERSION} + VERSION_GREATER 12.0) + file(TO_NATIVE_PATH + ${PADDLE_SOURCE_DIR}/patches/warprnnt/CMakeLists.txt.patch native_src) + set(WARPRNNT_PATCH_COMMAND + git checkout -- . 
&& git checkout ${WARPRNNT_TAG} && patch -Nd + ${SOURCE_DIR} < ${native_src}) + set(WARPRNNT_CCBIN_OPTION -DCCBIN_COMPILER=${CCBIN_COMPILER}) + endif() +endif() + +set(WARPRNNT_INCLUDE_DIR + "${WARPRNNT_INSTALL_DIR}/include" + CACHE PATH "Warp-rnnt Directory" FORCE) +# Used in unit test test_WarpCTCLayer +set(WARPRNNT_LIB_DIR + "${WARPRNNT_INSTALL_DIR}/lib" + CACHE PATH "Warp-rnnt Library Directory" FORCE) + +if(WIN32) + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/bin/warprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +else() + set(WARPRNNT_LIBRARIES + "${WARPRNNT_INSTALL_DIR}/lib/libwarprnnt${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-rnnt Library" FORCE) +endif() + +if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" + OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" + OR WIN32) + set(USE_OMP OFF) +else() + set(USE_OMP ON) +endif() + +if(WIN32) + set(WARPRNNT_C_FLAGS $) + set(WARPRNNT_C_FLAGS_DEBUG + $) + set(WARPRNNT_C_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS $) + set(WARPRNNT_CXX_FLAGS_RELEASE + $) + set(WARPRNNT_CXX_FLAGS_DEBUG + $) +else() + set(WARPRNNT_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") + set(WARPRNNT_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + set(WARPRNNT_C_FLAGS_RELEASE ${CMAKE_C_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + set(WARPRNNT_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE}) + set(WARPRNNT_CXX_FLAGS_DEBUG ${CMAKE_CXX_FLAGS_DEBUG}) +endif() +ExternalProject_Add( + extern_warprnnt + ${EXTERNAL_PROJECT_LOG_ARGS} + SOURCE_DIR ${SOURCE_DIR} + PREFIX ${WARPRNNT_PREFIX_DIR} + UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND ${WARPCTC_PATCH_CUDA_COMMAND} + COMMAND ${WARPRNNT_PATCH_ROCM_COMMAND} + # BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${WARPRNNT_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${WARPRNNT_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${WARPRNNT_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${WARPRNNT_CXX_FLAGS} + 
-DCMAKE_CXX_FLAGS_RELEASE=${WARPRNNT_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${WARPRNNT_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPRNNT_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DNVCC_FLAGS_EXTRA=${NVCC_FLAGS_EXTRA} + -DBUILD_SHARED=OFF + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + ${WARPRNNT_CCBIN_OPTION} + CMAKE_CACHE_ARGS + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPRNNT_INSTALL_DIR} + BUILD_BYPRODUCTS ${WARPRNNT_LIBRARIES}) + +message(STATUS "warp-rnnt library: ${WARPRNNT_LIBRARIES}") +get_filename_component(WARPRNNT_LIBRARY_PATH ${WARPRNNT_LIBRARIES} DIRECTORY) +include_directories(${WARPRNNT_INCLUDE_DIR}) # For warprnnt code to include its + # headers. + +add_library(warprnnt STATIC IMPORTED GLOBAL) +set_target_properties( + warprnnt PROPERTIES IMPORTED_LOCATION ${WARPRNNT_LIBRARIES} + INTERFACE_INCLUDE_DIRECTORIES ${WARPRNNT_INCLUDE_DIR}) diff --git a/backends/metax_gpu/compile.sh b/backends/metax_gpu/compile.sh index e9860ccb7d0..eba45a9ced2 100644 --- a/backends/metax_gpu/compile.sh +++ b/backends/metax_gpu/compile.sh @@ -30,7 +30,7 @@ fi echo "make_maca" cd build -cmake_maca .. -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON +cmake_maca .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DPython3_EXECUTABLE=$(which python3) -DWITH_GPU=ON make_maca -j10 diff --git a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu index 11def2c9ee4..2aa8424f0b1 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/calc_reduced_attn_kernel_register.cu @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "kernels/flash_attn_utils.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/calc_reduced_attn_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu index c8d69cecae1..f935014d17b 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_grad_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_grad_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_grad, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor_grad, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorGradKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu index 391dd908a8d..533204b8102 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/index_elementwise_put_kernel_register.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu" //NOLINT #include "paddle/phi/kernels/index_elementwise_put_kernel.h" - PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, metax_gpu, ALL_LAYOUT, @@ -31,3 +31,19 @@ PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put, phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} +PD_CUSTOM_KERNEL_REGISTER(index_elementwise_put_with_tensor, + metax_gpu, + ALL_LAYOUT, + phi::IndexElementwisePutWithTensorKernel, + bool, + float, + double, + int, + int8_t, + int64_t, + int16_t, + uint8_t, + phi::float16, + phi::bfloat16, + phi::complex64, + phi::complex128) {} diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu index e77a29d12fe..d02f805a671 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_grad_kernel_register.cu @@ -17,7 +17,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_grad_kernel.h" -PD_REGISTER_PLUGIN_KERNEL(warpctc_grad, +PD_CUSTOM_KERNEL_REGISTER(warpctc_grad, metax_gpu, ALL_LAYOUT, phi::WarpctcGradKernel, diff --git a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu index 5b343506cad..c488e23fba9 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu +++ b/backends/metax_gpu/kernels/cuda_kernels/warpctc_kernel_register.cu @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/warpctc_kernel.h" -PD_REGISTER_PLUGIN_KERNEL( +PD_CUSTOM_KERNEL_REGISTER( warpctc, metax_gpu, ALL_LAYOUT, phi::WarpctcKernel, float, double) {} diff --git a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h old mode 100755 new mode 100644 index 
419387cc9c4..ae4baa52613 --- a/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h +++ b/backends/metax_gpu/kernels/funcs/blas/blas_impl.cu.h @@ -34,70 +34,6 @@ PHI_DECLARE_bool(gemm_use_half_precision_compute_type); namespace phi { namespace funcs { - -inline static cublasHandle_t blas_handle_ = nullptr; -inline static cublasHandle_t blas_tensor_core_handle_ = nullptr; -inline static cublasHandle_t blas_tf32_tensor_core_handle_ = nullptr; - -inline std::once_flag flag_sparse_; -inline std::once_flag flag_blas_; -inline std::once_flag flag_blaslt_; -inline std::once_flag flag_dnn_; -inline std::once_flag flag_solver_; -inline std::once_flag flag_cublas_; -inline std::once_flag flag_tensorcore_cublas_; -inline std::once_flag flag_eigen_device_; - -inline std::mutex blas_mtx_; -inline std::mutex blas_tensor_core_mtx_; -inline std::mutex blas_tf32_mtx_; -inline std::mutex sparse_mtx_; -inline std::mutex stream_call_back_mtx_; - -inline void InitBlasHandle(cublasHandle_t *blas_handle, gpuStream_t stream) { - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cublasSetStream(*blas_handle, stream)); -} - -inline void CublasCall(const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - std::lock_guard guard(blas_mtx_); - callback(blas_handle_); -} - -inline bool MetaxTensorCoreAvailable() { - return blas_tensor_core_handle_ != nullptr; -} - -inline void TensorCoreCublasCallIfAvailable( - const std::function &callback, - phi::stream::stream_t stream) { - std::call_once(flag_tensorcore_cublas_, [&]() { - if (!blas_handle_) InitBlasHandle(&blas_handle_, stream); - if (!blas_tensor_core_handle_) { - 
InitBlasHandle(&blas_tensor_core_handle_, stream); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); - } - }); - if (blas_tensor_core_handle_ != nullptr) { - std::lock_guard guard(blas_tensor_core_mtx_); - callback(blas_tensor_core_handle_); - } else { - std::lock_guard guard(blas_mtx_); - callback(blas_handle_); - } -} - template struct CUBlas; @@ -174,28 +110,26 @@ struct CUBlas { // here. #if CUDA_VERSION >= 8000 VLOG(5) << "use_tensor_op_math: " - << (MetaxTensorCoreAvailable() ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc)); - }, - dev_ctx->stream()); + << (dev_ctx->tensor_core_available() ? "True" : "False"); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasSgemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasSgemmEx is not supported on cuda <= 7.5")); @@ -376,7 +310,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -386,31 +320,29 @@ struct CUBlas { thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A_ptr.data().get(), - Atype, - lda, - 
B_ptr.data().get(), - Btype, - ldb, - beta, - C_ptr.data().get(), - Ctype, - ldc, - batchCount, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A_ptr.data().get(), + Atype, + lda, + B_ptr.data().get(), + Btype, + ldb, + beta, + C_ptr.data().get(), + Ctype, + ldc, + batchCount, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmBatchedEx is not supported on cuda <= 7.5")); @@ -486,7 +418,7 @@ struct CUBlas { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -494,29 +426,27 @@ struct CUBlas { << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -696,7 +626,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = 
CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -704,29 +634,27 @@ struct CUBlas> { << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1024,7 +952,7 @@ struct CUBlas> { #if CUDA_VERSION >= 8000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx->tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1032,29 +960,27 @@ struct CUBlas> { << (use_tensor_op_math ? 
"True" : "False"); #endif // CUDA_VERSION >= 9000 - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - computeType, - algo)); - }, - dev_ctx->stream()); + dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + computeType, + algo)); + }); #else PADDLE_THROW(phi::errors::Unimplemented( "cublasGemmEx is not supported on cuda <= 7.5")); @@ -1186,24 +1112,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N); + }); } #if CUDA_VERSION >= 8000 @@ -1271,24 +1195,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - h_B, - ldb, - h_A, - lda, - &h_beta, - h_C, - N); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + h_B, + ldb, + h_A, + lda, + &h_beta, + h_C, + N); + }); #endif // CUDA_VERSION >= 8000 } @@ -1352,24 +1274,22 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, PADDLE_THROW(common::errors::Unimplemented( "GEMM_EX_64 is not supported on cuda < 
12.3")); } else { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &t_alpha, - B, - static_cast(ldb), - A, - static_cast(lda), - &t_beta, - C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &t_alpha, + B, + static_cast(ldb), + A, + static_cast(lda), + &t_beta, + C, + static_cast(N)); + }); } #if CUDA_VERSION >= 8000 @@ -1447,24 +1367,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, CUBLAS_COMPUTE_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &h_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &h_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1503,7 +1421,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1519,30 +1437,27 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - 
CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - N, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + N, + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -1621,24 +1536,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1713,24 +1626,22 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - CublasCall( - [&](cublasHandle_t handle) { - CUBlas>::GEMM(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &c_alpha, - h_B, - static_cast(ldb), - h_A, - static_cast(lda), - &c_beta, - h_C, - static_cast(N)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas>::GEMM(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &c_alpha, + h_B, + static_cast(ldb), + h_A, + static_cast(lda), + &c_beta, + h_C, + static_cast(N)); + }); #endif // CUDA_VERSION >= 8000 } } @@ -1769,7 +1680,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; 
cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -1784,30 +1695,28 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 12030 } else { CheckGEMMNSize(N); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - A, - CUDA_R_16BF, - static_cast(lda), - &h_beta, - C, - CUDA_R_16BF, - static_cast(N), - CUDA_R_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + A, + CUDA_R_16BF, + static_cast(lda), + &h_beta, + C, + CUDA_R_16BF, + static_cast(N), + CUDA_R_32F, + algo)); + }); } #else // raise error @@ -1860,24 +1769,22 @@ void Blas::GEMM(bool transA, } else { #endif // CUDA_VERSION >= 8000 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); #if CUDA_VERSION >= 8000 } @@ -1904,24 +1811,22 @@ inline void Blas::GEMM(bool transA, cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - A, - lda, - &beta, - C, - ldc); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + ldc); + }); } template <> @@ -1957,36 +1862,33 @@ inline void Blas::GEMM(bool transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - CUDA_R_16BF, - ldb, - A, - CUDA_R_16BF, - lda, - &h_beta, - C, - CUDA_R_16BF, - ldc, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -1998,27 +1900,23 @@ inline void Blas::GEMM(bool transA, template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); + }); } template <> template void Blas::SCAL(int n, const T alpha, T *x) const { - CublasCall( - [&](cublasHandle_t handle) 
{ CUBlas::SCAL(handle, n, &alpha, x, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::SCAL(handle, n, &alpha, x, 1); }); } template <> template void Blas::VCOPY(int n, const T *x, T *y) const { - CublasCall( - [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }, - dev_ctx_.stream()); + dev_ctx_.CublasCall( + [&](cublasHandle_t handle) { CUBlas::VCOPY(handle, n, x, 1, y, 1); }); } template <> @@ -2033,12 +1931,9 @@ void Blas::GEMV(bool trans_a, T *C) const { cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMV( - handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); + }); } template <> @@ -2112,7 +2007,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2153,60 +2048,56 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmStridedBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - a, - B, - fp, - ldb, - strideB, - A, - fp, - lda, - strideA, - b, - C, - fp, - ldc, - strideC, - batchCount, - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + a, + B, + 
fp, + ldb, + strideB, + A, + fp, + lda, + strideA, + b, + C, + fp, + ldc, + strideC, + batchCount, + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &beta, - C, - ldc, - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &beta, + C, + ldc, + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2242,7 +2133,7 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, if ((FLAGS_enable_cublas_tensor_op_math && (std::is_same::value)) || std::is_same::value) { cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2284,61 +2175,57 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - a, - B, - fp, - static_cast(ldb), - strideB, - A, - fp, - static_cast(lda), - strideA, - b, - C, - fp, - static_cast(ldc), - strideC, - static_cast(batchCount), - compute_type, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( + 
handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + a, + B, + fp, + static_cast(ldb), + strideB, + A, + fp, + static_cast(lda), + strideA, + b, + C, + fp, + static_cast(ldc), + strideC, + static_cast(batchCount), + compute_type, + algo)); + }); } } else { #endif // CUDA_VERSION >= 9010 T h_alpha = static_cast(alpha); T h_beta = static_cast(beta); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - static_cast(ldb), - strideB, - A, - static_cast(lda), - strideA, - &h_beta, - C, - static_cast(ldc), - strideC, - static_cast(batchCount)); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_STRIDED_BATCH(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + static_cast(ldb), + strideB, + A, + static_cast(lda), + strideA, + &h_beta, + C, + static_cast(ldc), + strideC, + static_cast(batchCount)); + }); #if CUDA_VERSION >= 9010 } @@ -2377,7 +2264,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2392,34 +2279,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), 
- strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2460,7 +2345,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float h_beta = beta; cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2475,34 +2360,32 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, "cublasGemmStridedBatchedEx_64 is not supported on cuda < 12.3")); #endif // CUDA_VERSION >= 12030 } else { - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmStridedBatchedEx( - handle, - cuTransB, - cuTransA, - static_cast(N), - static_cast(M), - static_cast(K), - &h_alpha, - B, - CUDA_R_16BF, - static_cast(ldb), - strideB, - A, - CUDA_R_16BF, - static_cast(lda), - strideA, - &h_beta, - C, - CUDA_R_16BF, - static_cast(ldc), - strideC, - static_cast(batchCount), - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmStridedBatchedEx(handle, + cuTransB, + cuTransA, + static_cast(N), + static_cast(M), + static_cast(K), + &h_alpha, + B, + CUDA_R_16BF, + static_cast(ldb), + strideB, + A, + CUDA_R_16BF, + static_cast(lda), + strideA, + &h_beta, + C, + CUDA_R_16BF, + static_cast(ldc), + strideC, + 
static_cast(batchCount), + CUBLAS_COMPUTE_32F, + algo)); + }); } #else // raise error @@ -2547,7 +2430,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // (std::is_same::value)) || // std::is_same::value) { // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx_.tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } @@ -2579,7 +2462,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // #endif // } -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2605,12 +2488,11 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // compute_type, // algo)); -// }, -// dev_ctx_.stream()); +// }); // } else { // #endif // CUDA_VERSION >= 9010 -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // CUBlas::GEMM_STRIDED_BATCH(handle, // cuTransB, @@ -2667,7 +2549,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // cublasOperation_t cuTransB = // (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; // const int64_t strideC = M * N; -// CublasCall( +// dev_ctx_.CublasCall( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasDgemmStridedBatched(handle, @@ -2723,14 +2605,14 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // float h_beta = static_cast(beta); // cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -// bool use_tensor_op_math = MetaxTensorCoreAvailable(); +// bool use_tensor_op_math = dev_ctx->tensor_core_available(); // if (use_tensor_op_math) { // algo = CUBLAS_GEMM_DFALT_TENSOR_OP; // } // VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : // "False"); -// TensorCoreCublasCallIfAvailable( +// dev_ctx_.TensorCoreCublasCallIfAvailable( // [&](cublasHandle_t handle) { // PADDLE_ENFORCE_GPU_SUCCESS( // phi::dynload::cublasGemmStridedBatchedEx(handle, @@ -2756,8 +2638,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, // batchCount, // CUBLAS_COMPUTE_32F, // algo)); -// }, -// dev_ctx_.stream()); +// }); // #else // // raise error // PADDLE_THROW(phi::errors::Unimplemented( @@ -2812,25 +2693,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2859,25 +2738,23 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GEMM_BATCH(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B_ptr.data().get(), - ldb, - A_ptr.data().get(), - lda, - &beta, - C_ptr.data().get(), - ldc, - batchCount); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GEMM_BATCH(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B_ptr.data().get(), + ldb, + A_ptr.data().get(), + lda, + &beta, + C_ptr.data().get(), + ldc, + batchCount); + }); } template <> @@ -2970,7 +2847,7 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, float f_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; 
- bool use_tensor_op_math = MetaxTensorCoreAvailable(); + bool use_tensor_op_math = dev_ctx_.tensor_core_available(); if (use_tensor_op_math) { algo = CUBLAS_GEMM_DFALT_TENSOR_OP; } @@ -2979,31 +2856,29 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, thrust::device_vector A_ptr(A, A + batchCount); thrust::device_vector B_ptr(B, B + batchCount); thrust::device_vector C_ptr(C, C + batchCount); - TensorCoreCublasCallIfAvailable( - [&](cublasHandle_t handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cublasGemmBatchedEx(handle, - cuTransB, - cuTransA, - N, - M, - K, - &f_alpha, - B_ptr.data().get(), - CUDA_R_16BF, - ldb, - A_ptr.data().get(), - CUDA_R_16BF, - lda, - &f_beta, - C_ptr.data().get(), - CUDA_R_16BF, - ldc, - batchCount, - CUBLAS_COMPUTE_32F, - algo)); - }, - dev_ctx_.stream()); + dev_ctx_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::cublasGemmBatchedEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &f_alpha, + B_ptr.data().get(), + CUDA_R_16BF, + ldb, + A_ptr.data().get(), + CUDA_R_16BF, + lda, + &f_beta, + C_ptr.data().get(), + CUDA_R_16BF, + ldc, + batchCount, + CUBLAS_COMPUTE_32F, + algo)); + }); #else // raise error PADDLE_THROW(phi::errors::Unimplemented( @@ -3038,33 +2913,19 @@ void Blas::TRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? 
CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM( + handle, cuSide, cuUplo, cuTransA, cuDiag, N, M, &alpha, A, lda, B, ldb); + }); } template <> template void Blas::BatchedGETRF( int n, T **a, int *ipiv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRF_BATCH(handle, n, a, n, ipiv, info, batch_size); + }); } template <> @@ -3084,23 +2945,18 @@ void Blas::BatchedGETRI(int n, "overlap memory space of input matrix (address: %p).", a_inv, a)); - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRI_BATCH( - handle, n, a, n, ipiv, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRI_BATCH(handle, n, a, n, ipiv, a_inv, n, info, batch_size); + }); } template <> template void Blas::BatchedMatInv( int n, const T **a, T **a_inv, int *info, int batch_size) const { - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::MATINV_BATCH(handle, n, a, n, a_inv, n, info, batch_size); + }); } template <> @@ -3118,12 +2974,10 @@ void Blas::BatchedGETRS(CBLAS_TRANSPOSE trans, // use CUBLAS_OP_C (conjugate transpose) for complex cublasOperation_t cuTrans = (trans == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::GETRS_BATCH( - handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::GETRS_BATCH( + handle, cuTrans, n, nrhs, a, lda, ipiv, b, ldb, info, batch_size); + }); } template <> @@ -3152,23 +3006,21 @@ void Blas::BatchedTRSM(CBLAS_SIDE side, cublasDiagType_t cuDiag = (diag == CblasUnit) ? CUBLAS_DIAG_UNIT : CUBLAS_DIAG_NON_UNIT; - CublasCall( - [&](cublasHandle_t handle) { - CUBlas::TRSM_BATCH(handle, - cuSide, - cuUplo, - cuTransA, - cuDiag, - N, - M, - &alpha, - A, - lda, - B, - ldb, - batch_size); - }, - dev_ctx_.stream()); + dev_ctx_.CublasCall([&](cublasHandle_t handle) { + CUBlas::TRSM_BATCH(handle, + cuSide, + cuUplo, + cuTransA, + cuDiag, + N, + M, + &alpha, + A, + lda, + B, + ldb, + batch_size); + }); } } // namespace funcs diff --git a/backends/metax_gpu/kernels/funcs/softmax.cu b/backends/metax_gpu/kernels/funcs/softmax.cu index d738a53f43a..44bfd02a308 100644 --- a/backends/metax_gpu/kernels/funcs/softmax.cu +++ b/backends/metax_gpu/kernels/funcs/softmax.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h index ec429950872..8c5996e680b 100644 --- a/backends/metax_gpu/kernels/funcs/values_vectors_functor.h +++ b/backends/metax_gpu/kernels/funcs/values_vectors_functor.h @@ -24,7 +24,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/common/errors.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_adam_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_adam_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/cuda_kernels/fused_bias_dropout_residual_layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_bias_dropout_residual_layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_embedding_eltwise_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_embedding_eltwise_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_layernorm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_layernorm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_seqpool_cvm_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_seqpool_cvm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_grad_kernel_register.cu rename to 
backends/metax_gpu/kernels/fusion/fused_softmax_mask_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_softmax_mask_upper_triangle_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_softmax_mask_upper_triangle_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_stack_transpose_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_stack_transpose_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu new file mode 100644 index 00000000000..08876233bfb --- /dev/null +++ b/backends/metax_gpu/kernels/fusion/fused_swiglu_weighted_bwd_kernel_register.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_swiglu_weighted_bwd_kernel.cu" //NOLINT + +PD_CUSTOM_KERNEL_REGISTER(fused_swiglu_weighted_bwd, + metax_gpu, + ALL_LAYOUT, + phi::FusedSwigluWeightedBwdKernel, + float, + double, + int, + int64_t, + phi::bfloat16) { + kernel->OutputAt(0).SetDataType(phi::DataType::BFLOAT16); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::BFLOAT16); +} diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_token_prune_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_token_prune_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_split_quant_kernel_register.cu rename to backends/metax_gpu/kernels/fusion/fused_transpose_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu b/backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_transpose_wlch_split_quant_kernel_register.cu rename to 
backends/metax_gpu/kernels/fusion/fused_transpose_wlch_split_quant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h index da61a1e5b41..a0f89047045 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h +++ b/backends/metax_gpu/kernels/gpudnn/conv_cudnn_v7.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "glog/logging.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/cuda/cuda_graph_with_memory_pool.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" #include "paddle/phi/kernels/gpudnn/conv_gpudnn_base.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu similarity index 98% rename from backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu index 885137675b4..e4acb2f95b6 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/conv_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_grad_kernel_register.cu @@ -161,7 +161,12 @@ void ConvCudnnGradKernelImplV7( args1.idesc.set(*transformed_input_grad, layout_tensor); args1.wdesc.set(*transformed_filter_channel, layout_tensor, iwo_groups); args1.odesc.set(*transformed_output_grad_channel, layout_tensor); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -184,7 +189,12 @@ void ConvCudnnGradKernelImplV7( args2.wdesc.set( *transformed_filter_grad_channel, layout_tensor, iwo_groups); args2.odesc.set(*transformed_output_grad_channel, layout_tensor); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_groups); + 
args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2)); @@ -1073,7 +1083,12 @@ void ConvCudnnGradGradKernel( args1.idesc.set(transformed_ddX, iwo_group); args1.wdesc.set(*W, layout, iwo_group); args1.odesc.set(transformed_ddO_channel, iwo_group); - args1.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args1.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; @@ -1092,7 +1107,12 @@ void ConvCudnnGradGradKernel( args2.idesc.set(transformed_X, iwo_group); args2.wdesc.set(*ddW, layout, iwo_group); args2.odesc.set(transformed_ddO_channel, iwo_group); - args2.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args2.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; @@ -1114,7 +1134,12 @@ void ConvCudnnGradGradKernel( args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args3.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; @@ -1136,7 +1161,12 @@ void ConvCudnnGradGradKernel( args4.idesc.set(transformed_dX, iwo_group); args4.wdesc.set(*ddW, layout, iwo_group); args4.odesc.set(transformed_dO_channel, iwo_group); - args4.cdesc.set(dtype, padding_common, strides, dilations, true, c_group); + args4.cdesc.set(dtype, + padding_common, + strides, + dilations, + phi::AllowTF32Cudnn(), + c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; diff --git 
a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu index bdff5fa9f93..0a83b504c76 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_kernel_register.cu @@ -81,7 +81,8 @@ void ConvCudnnKernelImplV7(const DenseTensor* transformed_input, args.cdesc.set( dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn(), groups); #else - args.cdesc.set(dtype, padding_common, strides, dilations, true); + args.cdesc.set( + dtype, padding_common, strides, dilations, phi::AllowTF32Cudnn()); #endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu index 0067818d165..b7eebfcee2e 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_grad_kernel_register.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "kernels/gpudnn/conv_cudnn_v7.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/ddim.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/backends/dynload/cudnn.h" diff --git a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu index aa1cc80d06d..532b7af0db4 100644 --- a/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu +++ b/backends/metax_gpu/kernels/gpudnn/conv_transpose_kernel.cu @@ -93,7 +93,12 @@ void ConvTransposeCudnnKernelImplV7(const DenseTensor* transformed_x, args.idesc.set(*transformed_out, iwo_groups); args.wdesc.set(*filter, layout_tensor, iwo_groups); args.odesc.set(*transformed_x, iwo_groups); - args.cdesc.set(dtype, padding_common, strides, dilations_, false, c_groups); + args.cdesc.set(dtype, + padding_common, + strides, + dilations_, + phi::AllowTF32Cudnn(), + c_groups); #ifdef PADDLE_WITH_HIP SearchResult bwd_result; diff --git a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu index c115f5ad930..1c2bfeedf34 100644 --- a/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu +++ b/backends/metax_gpu/kernels/gpudnn/pool_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "gpudnn/pool_gpudnn.h" -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h index 168752700e9..5844886ad1b 100644 --- a/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h +++ b/backends/metax_gpu/kernels/gpudnn/softmax_gpudnn.h @@ -25,7 +25,7 @@ #include "paddle/phi/kernels/primitive/kernel_primitives.h" // See Note [ Why still include the fluid headers? ] -#include "metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_device_function.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" diff --git a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h index 70af87513e5..c2e2e341bf5 100644 --- a/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/dirichlet_kernel_impl.h @@ -17,7 +17,7 @@ #include #include -#include "kernels/elementwise.h" +#include "kernels/metax_kernel/elementwise.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/amp_type_traits.h" diff --git a/backends/metax_gpu/kernels/impl/warpctc.h b/backends/metax_gpu/kernels/impl/warpctc.h deleted file mode 100644 index ba5da472ade..00000000000 --- a/backends/metax_gpu/kernels/impl/warpctc.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warpctc/include/ctc.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warpctc_dso_flag; -extern void* warpctc_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warpctc routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ - using warpctcFunc = decltype(&::__name); \ - std::call_once(warpctc_dso_flag, []() { \ - warpctc_dso_handle = phi::dynload::GetWarpCTCDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warpctc_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \ - DYNAMIC_LOAD_WARPCTC_WRAP(__name) - -#define WARPCTC_ROUTINE_EACH(__macro) \ - __macro(get_warpctc_version); \ - __macro(ctcGetStatusString); \ - __macro(compute_ctc_loss); \ - __macro(compute_ctc_loss_double); \ - __macro(get_workspace_size); \ - __macro(get_workspace_size_double) - -WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP); - -#undef DYNAMIC_LOAD_WARPCTC_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h index 51f4ce86890..16b740d5523 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_grad_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "kernels/impl/warpctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" @@ -24,6 +23,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { diff --git a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h index eb64f21c90f..cb39a0171ba 100644 --- a/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warpctc_kernel_impl.h @@ -16,7 +16,6 @@ #include -#include "kernels/impl/warpctc.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/lod_utils.h" #include "paddle/phi/core/tensor_utils.h" 
@@ -25,6 +24,7 @@ #include "paddle/phi/kernels/funcs/sequence_padding.h" #include "paddle/phi/kernels/funcs/sequence_scale.h" #include "paddle/utils/optional.h" +#include "third_party/warpctc/include/ctc.h" namespace phi { @@ -58,16 +58,16 @@ class ComputeCtcLossFunctor { float* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss(activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return compute_ctc_loss(activations, + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -84,17 +84,16 @@ class ComputeCtcLossFunctor { double* costs, void* workspace, ctcOptions options) { - return phi::dynload::compute_ctc_loss_double( - activations, - gradients, - flat_labels, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return compute_ctc_loss_double(activations, + gradients, + flat_labels, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -140,21 +139,19 @@ class WarpCTCFunctor { size_t workspace_bytes = 0; ctcStatus_t status = CTC_STATUS_UNKNOWN_ERROR; if (sizeof(T) == 4) { - status = - phi::dynload::get_workspace_size(cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size(cpu_label_lengths, + cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } else { - status = phi::dynload::get_workspace_size_double( - cpu_label_lengths, - cpu_input_lengths, - static_cast(sequence_width), - static_cast(num_sequences), - options_, - &workspace_bytes); + status = get_workspace_size_double(cpu_label_lengths, + 
cpu_input_lengths, + static_cast(sequence_width), + static_cast(num_sequences), + options_, + &workspace_bytes); } PADDLE_ENFORCE_EQ( CTC_STATUS_SUCCESS, @@ -162,7 +159,7 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -197,14 +194,15 @@ class WarpCTCFunctor { errors::PreconditionNotMet( "warp-ctc [version %d] Error in get_workspace_size: %s", warpctc_version_, - phi::dynload::ctcGetStatusString(status))); + ctcGetStatusString(status))); } protected: void init(const Context& dev_ctx, const size_t blank) { - warpctc_version_ = phi::dynload::get_warpctc_version(); + warpctc_version_ = get_warpctc_version(); - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = diff --git a/backends/metax_gpu/kernels/impl/warprnnt.h b/backends/metax_gpu/kernels/impl/warprnnt.h deleted file mode 100644 index 50b0dfc0efc..00000000000 --- a/backends/metax_gpu/kernels/impl/warprnnt.h +++ /dev/null @@ -1,63 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include // NOLINT - -#include "paddle/phi/backends/dynload/dynamic_loader.h" -#include "paddle/phi/common/port.h" -#include "third_party/warprnnt/include/rnnt.h" - -namespace phi { -namespace dynload { - -extern std::once_flag warprnnt_dso_flag; -extern void* warprnnt_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load warprnnt routine - * via operator overloading. - */ -#define DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using warprnntFunc = decltype(&::__name); \ - std::call_once(warprnnt_dso_flag, []() { \ - warprnnt_dso_handle = phi::dynload::GetWarpRNNTDsoHandle(); \ - }); \ - static void* p_##__name = dlsym(warprnnt_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ - extern DynLoad__##__name __name - -#define DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP(__name) \ - DYNAMIC_LOAD_WARPRNNT_WRAP(__name) - -#define WARPRNNT_ROUTINE_EACH(__macro) \ - __macro(get_warprnnt_version); \ - __macro(rnntGetStatusString); \ - __macro(compute_rnnt_loss); \ - __macro(compute_rnnt_loss_fp64); \ - __macro(get_rnnt_workspace_size); - -WARPRNNT_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPRNNT_WRAP); - -#undef DYNAMIC_LOAD_WARPRNNT_WRAP - -} // namespace dynload -} // namespace phi diff --git a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h index 96e756b16b1..8e3ab6fcdac 100644 --- a/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/warprnnt_kernel_impl.h @@ -16,12 +16,12 @@ #include -#include "kernels/impl/warprnnt.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include 
"third_party/warprnnt/include/rnnt.h" namespace phi { @@ -55,16 +55,16 @@ class ComputeRnntLossFunctor { float* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return compute_rnnt_loss(activations, + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -81,16 +81,16 @@ class ComputeRnntLossFunctor { double* costs, void* workspace, rnntOptions options) { - return phi::dynload::compute_rnnt_loss_fp64(activations, - gradients, - label, - label_lengths, - input_lengths, - static_cast(alphabet_size), - static_cast(minibatch), - costs, - workspace, - options); + return compute_rnnt_loss_fp64(activations, + gradients, + label, + label_lengths, + input_lengths, + static_cast(alphabet_size), + static_cast(minibatch), + costs, + workspace, + options); } }; @@ -117,6 +117,7 @@ class WarpRNNTFunctor { * \param blank blank label used in rnnt loss function. * \param cpu_loss loss of each example in CPU memory. */ + void operator()(const Context& dev_ctx, const T* input, T* gradient, @@ -138,7 +139,8 @@ class WarpRNNTFunctor { // There is no memory allocated operations within warp-rnnt. 
rnntStatus_t status = RNNT_STATUS_UNKNOWN_ERROR; bool gpu = false; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpu = true; #else @@ -148,7 +150,7 @@ class WarpRNNTFunctor { } size_t workspace_bytes = 0; - status = phi::dynload::get_rnnt_workspace_size( + status = get_rnnt_workspace_size( maxT, maxU, B, gpu, &workspace_bytes, sizeof(T)); PADDLE_ENFORCE_EQ( @@ -157,7 +159,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_rnnt_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); PADDLE_ENFORCE_GT( workspace_bytes, 0UL, @@ -189,7 +191,7 @@ class WarpRNNTFunctor { errors::PreconditionNotMet( "warp-rnnt [version %d] Error in get_workspace_size: %s", warprnnt_version_, - phi::dynload::rnntGetStatusString(status))); + rnntGetStatusString(status))); } protected: @@ -199,7 +201,7 @@ class WarpRNNTFunctor { const size_t blank, const float fastemit_lambda, const int num_threads) { - warprnnt_version_ = phi::dynload::get_warprnnt_version(); + warprnnt_version_ = get_warprnnt_version(); options_.maxT = maxT; options_.maxU = maxU; @@ -207,7 +209,8 @@ class WarpRNNTFunctor { options_.fastemit_lambda = fastemit_lambda; options_.batch_first = true; - if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { + if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || + dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = RNNT_GPU; options_.stream = diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/cuda_kernels/addmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/addmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/addmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_fc_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_fc_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu index 062646bbf9d..52fe5a1d566 100644 --- a/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/batch_norm_grad_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/flags.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/batch_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/batch_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bilinear_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bilinear_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu index bc9eb23c0e8..42810569fde 100644 --- a/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/blha_get_max_len_register.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/flash_attn_utils.h" #include "kernels/metax_kernel/block_attn.h" +#include "kernels/metax_kernel/flash_attn_utils.h" #include "paddle/phi/backends/context_pool.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/bmm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/bmm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/c_softmax_with_multi_label_cross_entropy_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/cholesky_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/cholesky_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu index e8fae2d9da5..8a39ae3f0a8 100644 --- 
a/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cholesky_kernel_register.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/conv_transpose_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/conv_transpose_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/crop_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/crop_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu index e94862ec7b0..043a64dc149 100644 --- a/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/cross_entropy_kernel_register.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/kernels/cross_entropy_kernel.h" #include "paddle/phi/kernels/full_kernel.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/depthwise_conv_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/depthwise_conv_kernel.cu diff --git a/backends/metax_gpu/kernels/elementwise.h b/backends/metax_gpu/kernels/metax_kernel/elementwise.h similarity index 100% rename from backends/metax_gpu/kernels/elementwise.h rename to backends/metax_gpu/kernels/metax_kernel/elementwise.h diff --git a/backends/metax_gpu/kernels/flags_declare.cu b/backends/metax_gpu/kernels/metax_kernel/flags_declare.cu similarity index 100% rename from backends/metax_gpu/kernels/flags_declare.cu rename to backends/metax_gpu/kernels/metax_kernel/flags_declare.cu diff --git a/backends/metax_gpu/kernels/flash_attn_grad_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_grad_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_grad_kernel.cu diff --git a/backends/metax_gpu/kernels/flash_attn_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.cu diff --git 
a/backends/metax_gpu/kernels/flash_attn_kernel.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_kernel.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_kernel.h diff --git a/backends/metax_gpu/kernels/flash_attn_utils.h b/backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h similarity index 100% rename from backends/metax_gpu/kernels/flash_attn_utils.h rename to backends/metax_gpu/kernels/metax_kernel/flash_attn_utils.h diff --git a/backends/metax_gpu/kernels/flashattn.cc b/backends/metax_gpu/kernels/metax_kernel/flashattn.cc similarity index 100% rename from backends/metax_gpu/kernels/flashattn.cc rename to backends/metax_gpu/kernels/metax_kernel/flashattn.cc diff --git a/backends/metax_gpu/kernels/flashattn.h b/backends/metax_gpu/kernels/metax_kernel/flashattn.h similarity index 100% rename from backends/metax_gpu/kernels/flashattn.h rename to backends/metax_gpu/kernels/metax_kernel/flashattn.h diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/flatten2_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/flatten2_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu similarity index 99% rename from backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu rename to 
backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu index ee4f105cbc5..c0d15b7f1b4 100644 --- a/backends/metax_gpu/kernels/cuda_kernels/fused_conv2d_add_act_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/fused_conv2d_add_act_kernel_register.cu @@ -308,7 +308,8 @@ class CudnnConvDescManager { int groups, cudnnDataType_t dtype) { auto* desc = new phi::backends::gpu::ConvolutionDescriptor(); - desc->set(dtype, paddings, strides, dilations, true, groups); + desc->set( + dtype, paddings, strides, dilations, phi::AllowTF32Cudnn(), groups); return desc; } diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/fused_rope_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/fused_rope_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu index d7540d949a9..bdf341f5a35 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_grad_kerne_registerl.cu @@ -13,7 +13,7 @@ // limitations under the License. 
#include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu index db975d74665..e0c0ae9c1d6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/instance_norm_kernel_register.cu @@ -13,7 +13,7 @@ // limitations under the License. #include "glog/logging.h" -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/common/layout.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/layer_norm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/layer_norm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/layer_norm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lstm_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lstm_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu index 
5a2d85418a1..72e4c5b2b79 100644 --- a/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/lu_kernel_register.cu @@ -18,7 +18,7 @@ #include "paddle/phi/backends/dynload/cusolver.h" #endif -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/lu_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/lu_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu index bda5dc62f1a..d8c3355e6e4 100644 --- a/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu +++ b/backends/metax_gpu/kernels/metax_kernel/matrix_rank_tol_kernel.cu @@ -18,7 +18,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/dynload/cusolver.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/common/type_traits.h" diff --git a/backends/metax_gpu/kernels/metax_context.cc b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc similarity index 69% rename from backends/metax_gpu/kernels/metax_context.cc rename to backends/metax_gpu/kernels/metax_kernel/metax_context.cc index 4df4d88b0b4..efddba5f00b 100644 --- a/backends/metax_gpu/kernels/metax_context.cc +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.cc @@ -12,10 +12,27 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" namespace phi { -bool AllowTF32Cudnn() { return false; } +const bool allow_tf32_cublas = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUBLAS"); + if (v) { + return std::atoi(v); + } + return true; +}(); + +const bool allow_tf32_cudnn = []() -> bool { + const char* v = std::getenv("ALLOW_TF32_CUDNN"); + if (v) { + return std::atoi(v); + } + return false; +}(); + +bool AllowTF32Cublas() { return allow_tf32_cublas; } +bool AllowTF32Cudnn() { return allow_tf32_cudnn; } void DnnWorkspaceHandle::RunFuncSync( const std::function& cudnn_func, size_t required_workspace_bytes, @@ -57,32 +74,4 @@ void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) { allocation_.reset(); allocation_ = allocator_->Allocate(required_workspace_bytes); } - -static std::function blaslt_handle_creator_{nullptr}; -static blasLtHandle_t blaslt_handle_{nullptr}; -static std::once_flag flag_blaslt_; - -static void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 - mcblasLtCreate(blaslt_handle); -#elif defined(PADDLE_WITH_HIP) - phi::dynload::hipblasLtCreate(blaslt_handle); -#endif -} - -blasLtHandle_t GetBlasLtHandle() { - std::call_once(flag_blaslt_, [&]() { - if (!blaslt_handle_) { - if (!blaslt_handle_creator_) - InitBlasLtHandle(&blaslt_handle_); - else - blaslt_handle_ = blaslt_handle_creator_(); - } - }); - PADDLE_ENFORCE_NOT_NULL( - blaslt_handle_, - common::errors::InvalidArgument( - "The GPU blasLt handle is nullptr. 
It must not be null.")); - return blaslt_handle_; -} } // namespace phi diff --git a/backends/metax_gpu/kernels/metax_context.h b/backends/metax_gpu/kernels/metax_kernel/metax_context.h similarity index 95% rename from backends/metax_gpu/kernels/metax_context.h rename to backends/metax_gpu/kernels/metax_kernel/metax_context.h index 5974aadcc41..2d761439089 100644 --- a/backends/metax_gpu/kernels/metax_context.h +++ b/backends/metax_gpu/kernels/metax_kernel/metax_context.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ -#define BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#ifndef BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ +#define BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ #include #include #include @@ -27,11 +27,11 @@ #include "paddle/phi/core/attribute.h" #include "paddle/phi/core/device_context.h" -using blasLtHandle_t = struct mcblasLtContext*; - -blasLtHandle_t GetBlasLtHandle(); +cublasLtHandle_t GetBlasLtHandle(); namespace phi { +bool AllowTF32Cublas(); +bool AllowTF32Cudnn(); class DnnWorkspaceHandle { public: inline DnnWorkspaceHandle(Allocator* allocator, gpuStream_t stream) @@ -128,7 +128,6 @@ inline void InitCusolverDnHandle(cusolverDnHandle_t* handle, } } -bool AllowTF32Cudnn(); inline cusolverDnHandle_t GetCusolverDnHandle(gpuStream_t stream, Place place) { std::call_once(flag_cusolver_dn_, [&]() { if (!cusolver_dn_handle_) { @@ -160,4 +159,4 @@ inline DnnWorkspaceHandle GetDnnWorkspace(Allocator* alloactor, return DnnWorkspaceHandle(alloactor, stream); } } // namespace phi -#endif // BACKENDS_METAX_GPU_KERNELS_METAX_CONTEXT_H_ +#endif // BACKENDS_METAX_GPU_KERNELS_METAX_KERNEL_METAX_CONTEXT_H_ diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu 
b/backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/multi_dot_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/multi_dot_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/mv_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/mv_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu index 745069e2eda..c3041254444 100644 --- a/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu +++ b/backends/metax_gpu/kernels/metax_kernel/qr_kernel_register.cu @@ -22,7 +22,7 @@ #include #include -#include "kernels/metax_context.h" +#include "kernels/metax_kernel/metax_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/enforce.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu similarity index 100% 
rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/rank_attention_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/rank_attention_kernel_register.cu diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc index 499832049e4..101b51aa350 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_grad_kernel.cu.cc @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/rnn_grad_kernel.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" diff --git a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc index f1cf9e09dc7..2598ce093e6 100644 --- a/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc +++ b/backends/metax_gpu/kernels/metax_kernel/rnn_kernel.cu.cc @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/rnn_kernel.h" #include "glog/logging.h" -#include "kernels/metax_context.h" //NOLINT +#include "kernels/metax_kernel/metax_context.h" //NOLINT #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/generator.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu similarity index 100% rename from 
backends/metax_gpu/kernels/cuda_kernels/slogdeterminant_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/slogdeterminant_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_grad_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_grad_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/softmax_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/softmax_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/standard_gamma_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/standard_gamma_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/stft_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/stft_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu similarity 
index 100% rename from backends/metax_gpu/kernels/cuda_kernels/svd_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/svd_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu old mode 100755 new mode 100644 similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/top_k_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/top_k_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_grad_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_grad_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/triangular_solve_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/triangular_solve_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/warprnnt_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/warprnnt_kernel_register.cu diff --git a/backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu b/backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_only_linear_kernel.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_only_linear_kernel.cu diff --git 
a/backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu b/backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu similarity index 100% rename from backends/metax_gpu/kernels/cuda_kernels/weight_quantize_kernel_register.cu rename to backends/metax_gpu/kernels/metax_kernel/weight_quantize_kernel_register.cu diff --git a/backends/metax_gpu/patch/intrinsics.cuh b/backends/metax_gpu/patch/intrinsics.cuh new file mode 100644 index 00000000000..71365b6577c --- /dev/null +++ b/backends/metax_gpu/patch/intrinsics.cuh @@ -0,0 +1,459 @@ +/****************************************************************************** + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * + * Code and text by Sean Baxter, NVIDIA Research + * See http://nvlabs.github.io/moderngpu for repository and documentation. + * + ******************************************************************************/ + +#include "devicetypes.cuh" + +#pragma once + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" + +namespace mgpu { + +MGPU_HOST_DEVICE uint2 ulonglong_as_uint2(uint64 x) { + return *reinterpret_cast(&x); +} +MGPU_HOST_DEVICE uint64 uint2_as_ulonglong(uint2 x) { + return *reinterpret_cast(&x); +} + +MGPU_HOST_DEVICE int2 longlong_as_int2(int64 x) { + return *reinterpret_cast(&x); +} +MGPU_HOST_DEVICE int64 int2_as_longlong(int2 x) { + return *reinterpret_cast(&x); +} + +MGPU_HOST_DEVICE int2 double_as_int2(double x) { + return *reinterpret_cast(&x); +} +MGPU_HOST_DEVICE double int2_as_double(int2 x) { + return *reinterpret_cast(&x); +} + +MGPU_HOST_DEVICE void SetDoubleX(double& d, int x) { + reinterpret_cast(&d)[0] = x; +} +MGPU_HOST_DEVICE int GetDoubleX(double d) { + return double_as_int2(d).x; +} +MGPU_HOST_DEVICE void SetDoubleY(double& d, int y) { + reinterpret_cast(&d)[1] = y; +} +MGPU_HOST_DEVICE int GetDoubleY(double d) { + return double_as_int2(d).y; +} + + 
+//////////////////////////////////////////////////////////////////////////////// +// PTX for bfe and bfi + +#if __CUDA_ARCH__ >= 200 + +MGPU_DEVICE uint bfe_ptx(uint x, uint bit, uint numBits) { + uint result; + asm("bfe.u32 %0, %1, %2, %3;" : + "=r"(result) : "r"(x), "r"(bit), "r"(numBits)); + return result; +} + + +MGPU_DEVICE uint bfi_ptx(uint x, uint y, uint bit, uint numBits) { + uint result; + asm("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(result) : "r"(x), "r"(y), "r"(bit), "r"(numBits)); + return result; +} + +MGPU_DEVICE uint prmt_ptx(uint a, uint b, uint index) { + uint ret; + asm("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + +#endif // __CUDA_ARCH__ >= 200 + + +//////////////////////////////////////////////////////////////////////////////// +// shfl_up + +__device__ __forceinline__ float shfl_up(float var, + unsigned int delta, int width = 32) { + +#if __CUDA_ARCH__ >= 300 +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + var = __shfl_up_sync(0xFFFFFFFF, var, delta, width); +#else + var = __shfl_up(var, delta, width); +#endif +#endif + return var; +} + +__device__ __forceinline__ double shfl_up(double var, + unsigned int delta, int width = 32) { + +#if __CUDA_ARCH__ >= 300 + int2 p = mgpu::double_as_int2(var); +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + p.x = __shfl_up_sync(0xFFFFFFFF, p.x, delta, width); + p.y = __shfl_up_sync(0xFFFFFFFF, p.y, delta, width); +#else + p.x = __shfl_up(p.x, delta, width); + p.y = __shfl_up(p.y, delta, width); +#endif + var = mgpu::int2_as_double(p); +#endif + + return var; +} + +//////////////////////////////////////////////////////////////////////////////// +// shfl_add + +// MGPU_DEVICE int shfl_add(int x, int offset, int width = WARP_SIZE) { +// int result = 0; +// #if __CUDA_ARCH__ >= 300 +// int mask = (WARP_SIZE - width)<< 8; +// #if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +// asm( +// "{.reg .s32 r0;" +// ".reg 
.pred p;" +// "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;" +// "@p add.s32 r0, r0, %4;" +// "mov.s32 %0, r0; }" +// : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +// #else +// asm( +// "{.reg .s32 r0;" +// ".reg .pred p;" +// "shfl.up.b32 r0|p, %1, %2, %3;" +// "@p add.s32 r0, r0, %4;" +// "mov.s32 %0, r0; }" +// : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +// #endif +// #endif +// return result; +// } + +MGPU_DEVICE int shfl_add(int x, int offset, int width = 32) +{ +#if __CUDA_ARCH__ >= 300 + unsigned fullMask = 0xffffffffU; + unsigned mask = (width == 32) ? fullMask : ((1U << width) - 1U); + int src = 0; +#if defined(__CUDACC_VER_MAJOR__) && __CUDACC_VER_MAJOR__ >= 9 + src = __shfl_up_sync(mask, x, offset, width); // CUDA 9+ +#else + src = __shfl_up(x, offset, width); // CUDA 8- +#endif + int lane = threadIdx.x & 31; + return (lane >= offset) ? (src + x) : x; +#else + return x; +#endif +} + +MGPU_DEVICE int shfl_max(int x, int offset, int width = WARP_SIZE) { + int result = 0; +#if __CUDA_ARCH__ >= 300 + int mask = (WARP_SIZE - width)<< 8; +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.sync.b32 r0|p, %1, %2, %3, 0xFFFFFFFF;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#else + asm( + "{.reg .s32 r0;" + ".reg .pred p;" + "shfl.up.b32 r0|p, %1, %2, %3;" + "@p max.s32 r0, r0, %4;" + "mov.s32 %0, r0; }" + : "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x)); +#endif +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// brev, popc, clz, bfe, bfi, prmt + +// Reverse the bits in an integer. +MGPU_HOST_DEVICE uint brev(uint x) { +#if __CUDA_ARCH__ >= 200 + uint y = __brev(x); +#else + uint y = 0; + for(int i = 0; i < 32; ++i) + y |= (1 & (x>> i))<< (31 - i); +#endif + return y; +} + +// Count number of bits in a register. 
+MGPU_HOST_DEVICE int popc(uint x) { +#if __CUDA_ARCH__ >= 200 + return __popc(x); +#else + int c; + for(c = 0; x; ++c) + x &= x - 1; + return c; +#endif +} + +// Count leading zeros - start from most significant bit. +MGPU_HOST_DEVICE int clz(int x) { +#if __CUDA_ARCH__ >= 200 + return __clz(x); +#else + for(int i = 31; i >= 0; --i) + if((1<< i) & x) return 31 - i; + return 32; +#endif +} + +// Find first set - start from least significant bit. LSB is 1. ffs(0) is 0. +MGPU_HOST_DEVICE int ffs(int x) { +#if __CUDA_ARCH__ >= 200 + return __ffs(x); +#else + for(int i = 0; i < 32; ++i) + if((1<< i) & x) return i + 1; + return 0; +#endif +} + +MGPU_HOST_DEVICE uint bfe(uint x, uint bit, uint numBits) { +#if __CUDA_ARCH__ >= 200 + return bfe_ptx(x, bit, numBits); +#else + return ((1<< numBits) - 1) & (x>> bit); +#endif +} + +MGPU_HOST_DEVICE uint bfi(uint x, uint y, uint bit, uint numBits) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = bfi_ptx(x, y, bit, numBits); +#else + if(bit + numBits > 32) numBits = 32 - bit; + uint mask = ((1<< numBits) - 1)<< bit; + result = y & ~mask; + result |= mask & (x<< bit); +#endif + return result; +} + +MGPU_HOST_DEVICE uint prmt(uint a, uint b, uint index) { + uint result; +#if __CUDA_ARCH__ >= 200 + result = prmt_ptx(a, b, index); +#else + result = 0; + for(int i = 0; i < 4; ++i) { + uint sel = 0xf & (index>> (4 * i)); + uint x = ((7 & sel) > 3) ? b : a; + x = 0xff & (x>> (8 * (3 & sel))); + if(8 & sel) x = (128 & x) ? 0xff : 0; + result |= x<< (8 * i); + } +#endif + return result; +} + +// Find log2(x) and optionally round up to the next integer logarithm. 
+MGPU_HOST_DEVICE int FindLog2(int x, bool roundUp = false) { + int a = 31 - clz(x); + if(roundUp) a += !MGPU_IS_POW_2(x); + return a; +} + +//////////////////////////////////////////////////////////////////////////////// +// vset4 + +#if __CUDA_ARCH__ >= 300 + +// Performs four byte-wise comparisons and returns 1 for each byte that +// satisfies the conditional, and zero otherwise. +MGPU_DEVICE uint vset4_lt_add_ptx(uint a, uint b, uint c) { + uint result; + asm("vset4.u32.u32.lt.add %0, %1, %2, %3;" : + "=r"(result) : "r"(a), "r"(b), "r"(c)); + return result; +} +MGPU_DEVICE uint vset4_eq_ptx(uint a, uint b) { + uint result; + asm("vset4.u32.u32.eq %0, %1, %2, %3;" : + "=r"(result) : "r"(a), "r"(b), "r"(0)); + return result; +} +#endif // __CUDA_ARCH__ >= 300 + +MGPU_HOST_DEVICE uint vset4_lt_add(uint a, uint b, uint c) { + uint result; +#if __CUDA_ARCH__ >= 300 + result = vset4_lt_add_ptx(a, b, c); +#else + result = c; + if((0x000000ff & a) < (0x000000ff & b)) result += 0x00000001; + if((0x0000ff00 & a) < (0x0000ff00 & b)) result += 0x00000100; + if((0x00ff0000 & a) < (0x00ff0000 & b)) result += 0x00010000; + if((0xff000000 & a) < (0xff000000 & b)) result += 0x01000000; +#endif + return result; +} + +MGPU_HOST_DEVICE uint vset4_eq(uint a, uint b) { + uint result; +#if __CUDA_ARCH__ >= 300 + result = vset4_eq_ptx(a, b); +#else + result = 0; + if((0x000000ff & a) == (0x000000ff & b)) result = 0x00000001; + if((0x0000ff00 & a) == (0x0000ff00 & b)) result += 0x00000100; + if((0x00ff0000 & a) == (0x00ff0000 & b)) result += 0x00010000; + if((0xff000000 & a) == (0xff000000 & b)) result += 0x01000000; +#endif + return result; +} + +//////////////////////////////////////////////////////////////////////////////// +// + +MGPU_HOST_DEVICE uint umulhi(uint x, uint y) { +#if __CUDA_ARCH__ >= 100 + return __umulhi(x, y); +#else + uint64 product = (uint64)x * y; + return (uint)(product>> 32); +#endif +} + 
+//////////////////////////////////////////////////////////////////////////////// +// ldg() function defined for all devices and all types. Only compiles to __ldg +// intrinsic for __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 for types supported +// by __ldg in sm_32_intrinsics.h + +template +struct IsLdgType { + enum { value = false }; +}; +#define DEFINE_LDG_TYPE(T) \ + template<> struct IsLdgType { enum { value = true }; }; + +template::value> +struct LdgShim { + MGPU_DEVICE static T Ldg(const T* p) { + return *p; + } +}; + +#if __CUDA_ARCH__ >= 320 && __CUDA_ARCH__ < 400 + + // List of __ldg-compatible types from sm_32_intrinsics.h. + DEFINE_LDG_TYPE(char) + DEFINE_LDG_TYPE(short) + DEFINE_LDG_TYPE(int) + DEFINE_LDG_TYPE(long long) + DEFINE_LDG_TYPE(char2) + DEFINE_LDG_TYPE(char4) + DEFINE_LDG_TYPE(short2) + DEFINE_LDG_TYPE(short4) + DEFINE_LDG_TYPE(int2) + DEFINE_LDG_TYPE(int4) + DEFINE_LDG_TYPE(longlong2) + + DEFINE_LDG_TYPE(unsigned char) + DEFINE_LDG_TYPE(unsigned short) + DEFINE_LDG_TYPE(unsigned int) + DEFINE_LDG_TYPE(unsigned long long) + DEFINE_LDG_TYPE(uchar2) + DEFINE_LDG_TYPE(uchar4) + DEFINE_LDG_TYPE(ushort2) + DEFINE_LDG_TYPE(ushort4) + DEFINE_LDG_TYPE(uint2) + DEFINE_LDG_TYPE(uint4) + DEFINE_LDG_TYPE(ulonglong2) + + DEFINE_LDG_TYPE(float) + DEFINE_LDG_TYPE(double) + DEFINE_LDG_TYPE(float2) + DEFINE_LDG_TYPE(float4) + DEFINE_LDG_TYPE(double2) + + template struct LdgShim { + MGPU_DEVICE static T Ldg(const T* p) { + return __ldg(p); + } + }; +#endif + +template +MGPU_DEVICE T ldg(const T* p) { + return LdgShim::Ldg(p); +} + +//////////////////////////////////////////////////////////////////////////////// + +// Fast division for 31-bit integers. +// Uses the method in Hacker's Delight (2nd edition) page 228. +// Evaluates for denom > 1 and x < 2^31. 
+struct FastDivide { + uint denom; + uint coef; + uint shift; + + MGPU_HOST_DEVICE uint Divide(uint x) { + return umulhi(x, coef)>> shift; + } + MGPU_HOST_DEVICE uint Modulus(uint x) { + return x - Divide(x) * denom; + } + + explicit FastDivide(uint denom_) { + denom = denom_; + uint p = 31 + FindLog2(denom, true); + coef = (uint)(((1ull<< p) + denom - 1) / denom); + shift = p - 32; + } +}; + +#pragma GCC diagnostic pop + +} // namespace mgpu diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 8127caee61e..beefb730bf7 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -134,11 +134,11 @@ index c0080f0a5e..458ca3e2e8 100644 } // namespace phi diff --git a/paddle/phi/backends/dynload/cufft.h b/paddle/phi/backends/dynload/cufft.h -index 1547909d92..66b2779392 100644 +index 1547909d92..ef20838434 100644 --- a/paddle/phi/backends/dynload/cufft.h +++ b/paddle/phi/backends/dynload/cufft.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -226,11 +226,11 @@ index c5309e7e11..3328571380 100644 } \ }; \ diff --git a/paddle/phi/backends/gpu/cuda/cuda_device_function.h b/paddle/phi/backends/gpu/cuda/cuda_device_function.h -index 4ff2e528a9..81421c8ca1 100644 +index 4ff2e528a9..23f7f4b583 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_device_function.h +++ b/paddle/phi/backends/gpu/cuda/cuda_device_function.h @@ -1,3 +1,4 @@ -+// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. ++// 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); @@ -470,6 +470,24 @@ index 88663ec880..98b93072a3 100644 #include "paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h" #include "paddle/phi/kernels/funcs/detail/gru_kernel.h" +diff --git a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +index 4eae698648..5c047723ea 100644 +--- a/paddle/phi/kernels/funcs/layer_norm_impl.cu.h ++++ b/paddle/phi/kernels/funcs/layer_norm_impl.cu.h +@@ -43,11 +43,11 @@ template + using LayerNormParamType = typename CudnnDataType::BatchNormParamType; + + inline static int GetDesiredBlockDim(int64_t block_dim) { +- const int kMaxBlockDim = 512; ++ const int kMaxBlockDim = 256; + #ifdef __HIPCC__ + const int lwarpSize = 64; + #else +- const int lwarpSize = 32; ++ const int lwarpSize = 64; + #endif + return block_dim >= kMaxBlockDim ? kMaxBlockDim : lwarpSize; + } diff --git a/paddle/phi/kernels/funcs/math/context_project.h b/paddle/phi/kernels/funcs/math/context_project.h index 15e1a4a3c3..e4780538d7 100644 --- a/paddle/phi/kernels/funcs/math/context_project.h @@ -529,7 +547,7 @@ index 8b0baf5f5f..260482f124 100644 namespace phi { diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h -index e30d440ff3..3c74792690 100644 +index e30d440ff3..108edda7ca 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -30,11 +30,11 @@ limitations under the License. 
*/ @@ -728,7 +746,7 @@ index e30d440ff3..3c74792690 100644 - asm("bfi.b32 %0, %1, %2, %3, %4;" - : "=r"(ret) - : "r"(to_insert), "r"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (32 - pos - len)) >> (32 - len); return ret; } @@ -738,7 +756,7 @@ index e30d440ff3..3c74792690 100644 int len) { uint64_t ret; - asm("bfe.u64 %0, %1, %2, %3;" : "=l"(ret) : "l"(val), "r"(pos), "r"(len)); -+ ++ + + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); return ret; @@ -751,7 +769,7 @@ index e30d440ff3..3c74792690 100644 - asm("bfi.b64 %0, %1, %2, %3, %4;" - : "=l"(ret) - : "l"(to_insert), "l"(val), "r"(pos), "r"(len)); -+ ++ + ret = (static_cast(val) << (64 - pos - len)) >> (64 - len); + return ret; @@ -763,7 +781,7 @@ index e30d440ff3..3c74792690 100644 int lane_id; - asm("mov.s32 %0, %%laneid;" : "=r"(lane_id)); - return lane_id; -+ ++ +// // >>>> PTX2CPP Success <<<< +// { +// (lane_id)=(threadIdx.x&(warpSize-1)); @@ -851,6 +869,19 @@ index e838778952..83e805e75a 100644 namespace phi { namespace fusion { +diff --git a/paddle/phi/kernels/gpu/correlation_kernel.cu b/paddle/phi/kernels/gpu/correlation_kernel.cu +index 4c93778bde..c7bdf8a2cc 100644 +--- a/paddle/phi/kernels/gpu/correlation_kernel.cu ++++ b/paddle/phi/kernels/gpu/correlation_kernel.cu +@@ -103,7 +103,7 @@ void CorrelationCUDAKernel(const Context &dev_ctx, + int stride2, + int corr_type_multiply, + DenseTensor *out) { +- bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU; ++ bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM; + PADDLE_ENFORCE_EQ( + is_gpu_place, + true, diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index f0cca0f701..02ea957240 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h @@ -866,6 +897,19 @@ index f0cca0f701..02ea957240 100644 namespace phi { // To determine use cudnn or not. 
+diff --git a/paddle/phi/kernels/gpu/dgc_kernel.cu b/paddle/phi/kernels/gpu/dgc_kernel.cu +index c2ddfa1347..c6adf5a6de 100644 +--- a/paddle/phi/kernels/gpu/dgc_kernel.cu ++++ b/paddle/phi/kernels/gpu/dgc_kernel.cu +@@ -188,7 +188,7 @@ void DGCKernel(const Context& dev_ctx, + int buf_size = paddle::communication::dgc::get_buffer_size(k); + phi::Allocator::AllocationPtr tmp_ious_data; + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +- if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + tmp_ious_data = phi::memory_utils::Alloc( + dev_ctx.GetPlace(), + buf_size, diff --git a/paddle/phi/kernels/gpu/gelu_funcs.h b/paddle/phi/kernels/gpu/gelu_funcs.h index 29fa252e96..4ae72b0935 100644 --- a/paddle/phi/kernels/gpu/gelu_funcs.h @@ -930,6 +974,19 @@ index 1bdbe1564c..f753b54bc6 100644 #include "paddle/phi/kernels/impl/qr_kernel_impl.h" #include "paddle/phi/kernels/impl/tril_triu_kernel_impl.h" #include "paddle/phi/kernels/lstsq_kernel.h" +diff --git a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +index 05a977828f..5136608c41 100644 +--- a/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu ++++ b/paddle/phi/kernels/gpu/shuffle_batch_kernel.cu +@@ -58,7 +58,7 @@ void ShuffleBatchKernel(const Context& dev_ctx, + int64_t seed_int = 0; + if (seed.initialized()) { + const auto& seed_place = seed.place().GetType(); +- bool is_gpu_place = seed_place == phi::AllocationType::GPU; ++ bool is_gpu_place = seed_place == phi::AllocationType::GPU || seed_place == phi::AllocationType::CUSTOM; + if (is_gpu_place) { + // NOTE: We have overwritten GetKernelTypeForVar, so seed_place would + // not be CUDAPlace in practice. 
This case would only happen in Python diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index 9bc5326c90..79b57a8203 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -1037,7 +1094,7 @@ index 410fb3c560..009ce03440 100644 if (x == T{0.0}) { T inf = std::numeric_limits::infinity(); diff --git a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h -index 5ebbc8d2db..48acf8d0cd 100644 +index 5ebbc8d2db..c7b6c338e2 100644 --- a/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h +++ b/paddle/phi/kernels/impl/llm_int8_matmul_kernel_impl.h @@ -15,8 +15,9 @@ limitations under the License. */ @@ -1048,7 +1105,7 @@ index 5ebbc8d2db..48acf8d0cd 100644 -#include "paddle/phi/kernels/funcs/quant_dequant.h" +#include "kernels/funcs/blas/cublaslt.h" +#include "kernels/funcs/quant_dequant.h" -+#include "kernels/metax_context.h" ++#include "kernels/metax_kernel/metax_context.h" #pragma once @@ -1087,6 +1144,32 @@ index 6f03f76eeb..5fe2c3e7dc 100644 #include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/matrix_inverse.h" +diff --git a/paddle/phi/kernels/impl/merged_momentum_impl.h b/paddle/phi/kernels/impl/merged_momentum_impl.h +index 7b85903776..3f4b298807 100644 +--- a/paddle/phi/kernels/impl/merged_momentum_impl.h ++++ b/paddle/phi/kernels/impl/merged_momentum_impl.h +@@ -297,7 +297,7 @@ void MergedMomentumInnerCompute( + params_out[idx], + velocities_out[idx]); + VLOG(10) << "Launch MergedMomentum cpu kernel."; +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + phi::funcs::ForRange for_range( + static_cast(dev_ctx), params[idx]->numel()); + const auto grad_type = grads[idx]->dtype(); +diff --git a/paddle/phi/kernels/impl/momentum_kernel_impl.h 
b/paddle/phi/kernels/impl/momentum_kernel_impl.h +index de5bcfc30b..eb2a9714f5 100644 +--- a/paddle/phi/kernels/impl/momentum_kernel_impl.h ++++ b/paddle/phi/kernels/impl/momentum_kernel_impl.h +@@ -457,7 +457,7 @@ void MomentumDenseImpl(const Context& dev_ctx, + regularization_coeff, + param_out, + velocity_out); +- } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU) { ++ } else if (dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU || dev_ctx.GetPlace().GetType() == phi::AllocationType::CUSTOM) { + funcs::ForRange for_range(dev_ctx, param.numel()); + const auto grad_type = grad.dtype(); + #define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ diff --git a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h b/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h index 4099d8b506..baef2cd643 100644 --- a/paddle/phi/kernels/impl/spectral_norm_kernel_impl.h @@ -1100,23 +1183,3 @@ index 4099d8b506..baef2cd643 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" -diff --git a/third_party/flagcx b/third_party/flagcx -index 7c469f4af9..7e6c4cc3ca 160000 ---- a/third_party/flagcx -+++ b/third_party/flagcx -@@ -1 +1 @@ --Subproject commit 7c469f4af991bf0f64b8f76d66f8e307a5eaea3f -+Subproject commit 7e6c4cc3cad3fce9b3dedfe46a9d195d616e8ffa -diff --git a/third_party/flashattn b/third_party/flashattn -index 581e48aa69..749aca3807 160000 ---- a/third_party/flashattn -+++ b/third_party/flashattn -@@ -1 +1 @@ --Subproject commit 581e48aa693a17ec3676ec2715d46130310d318d -+Subproject commit 749aca380794b472096d4e7ea01dd252ab0887c9 -diff --git a/third_party/yaml-cpp b/third_party/yaml-cpp ---- a/third_party/yaml-cpp -+++ b/third_party/yaml-cpp -@@ -1 +1 @@ --Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91 -+Subproject commit 1d8ca1f35eb3a9c9142462b28282a848e5d29a91-dirty diff --git a/backends/metax_gpu/tests/CMakeLists.txt b/backends/metax_gpu/tests/CMakeLists.txt index 
d2e92f209ab..ded54233f24 100755 --- a/backends/metax_gpu/tests/CMakeLists.txt +++ b/backends/metax_gpu/tests/CMakeLists.txt @@ -5,30 +5,101 @@ enable_testing() find_package(Python REQUIRED COMPONENTS Interpreter) -file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "unittest/*.py") - -list( - APPEND - PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test/test_tril_triu_op.py -) - -list( - REMOVE_ITEM - PYTHON_TEST_SCRIPTS - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_cumsum_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_softmax_with_cross_entropy_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_expand_v2_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_tril_triu_op_metax.py - ${CMAKE_CURRENT_LIST_DIR}/unittest/test_squared_l2_norm_op_metax.py) +set(PADDLE_LEGACY_TEST_PATH + ${CMAKE_CURRENT_LIST_DIR}/../../../Paddle/test/legacy_test) +set(METAX_UNIT_TEST_PATH ${CMAKE_CURRENT_LIST_DIR}/unit_test) + +file(GLOB_RECURSE PYTHON_TEST_SCRIPTS "${METAX_UNIT_TEST_PATH}/*.py") + +if(NOT TEST_LIST_FILE) + message( + STATUS + " is not set, default test list [ ${CMAKE_CURRENT_LIST_DIR}/default.txt ] will be used." 
+ ) + file(STRINGS ${CMAKE_CURRENT_LIST_DIR}/default.txt TEST_PROGRAMS) + +else() + if(NOT EXISTS ${TEST_LIST_FILE}) + message(FATAL_ERROR " is not exist, please check it again.") + endif() + + file(STRINGS ${TEST_LIST_FILE} TEST_PROGRAMS) + + if(NOT TEST_PROGRAMS) + message(FATAL_ERROR " is empty.") + endif() + + set(PYTHON_TEST_SCRIPTS "") +endif() + +foreach(test_name ${TEST_PROGRAMS}) + set(CURRENT_TEST_PROGRAM ${PADDLE_LEGACY_TEST_PATH}/${test_name}.py) + if(NOT EXISTS ${CURRENT_TEST_PROGRAM}) + message(WARNING "${CURRENT_TEST_PROGRAM} is not exist, skip it.") + else() + list(APPEND PYTHON_TEST_SCRIPTS ${CURRENT_TEST_PROGRAM}) + endif() +endforeach() list(REMOVE_DUPLICATES PYTHON_TEST_SCRIPTS) + +if(NOT TEST_LIST_FILE) + list( + REMOVE_ITEM + PYTHON_TEST_SCRIPTS + # 精度问题 + ${PADDLE_LEGACY_TEST_PATH}/test_sum_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_max_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_cumsum_op.py + # core.cudnnversion + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_with_cross_entropy_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_softmax_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_add_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_gather_op.py + # op_test.py 里 self._get_places()接口的适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_pow_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_layer_norm_op.py + # device == "gpu" 适配问题 + ${PADDLE_LEGACY_TEST_PATH}/test_index_add_op.py + # paddle-gpu 报错一致 + ${PADDLE_LEGACY_TEST_PATH}/test_elementwise_div_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_stack_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_logical_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_mean_op.py + # paddle.device.cuda.get_device_properties + ${PADDLE_LEGACY_TEST_PATH}/test_transpose_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_randint_op.py + ${PADDLE_LEGACY_TEST_PATH}/test_uniform_random_op.py + # needs check_grad with fp64 precision + ${PADDLE_LEGACY_TEST_PATH}/test_c_embedding_op.py + # CUDAPinnedPlace 问题 + ${PADDLE_LEGACY_TEST_PATH}/test_slice_op.py + 
${PADDLE_LEGACY_TEST_PATH}/test_compare_op.py) +endif() + +if(LOG_OUTPUT_DIR AND NOT EXISTS ${LOG_OUTPUT_DIR}) + file(MAKE_DIRECTORY ${LOG_OUTPUT_DIR}) + message(WARNING "${LOG_OUTPUT_DIR} is not exist, create it now.") +endif() + foreach(test_script ${PYTHON_TEST_SCRIPTS}) get_filename_component(test_name ${test_script} NAME_WE) - add_test( - NAME "python_${test_name}" - COMMAND ${Python_EXECUTABLE} ${test_script} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + if(LOG_OUTPUT_DIR) + set(test_log_file "${LOG_OUTPUT_DIR}/${test_name}.log") + + add_test( + NAME "python_${test_name}" + COMMAND sh -c + "${Python_EXECUTABLE} ${test_script} > ${test_log_file} 2>&1" + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + + else() + add_test( + NAME "python_${test_name}" + COMMAND ${Python_EXECUTABLE} ${test_script} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + set_tests_properties("python_${test_name}" PROPERTIES TIMEOUT 360) endforeach() diff --git a/backends/metax_gpu/tests/default.txt b/backends/metax_gpu/tests/default.txt new file mode 100644 index 00000000000..8e2c3bcdd7e --- /dev/null +++ b/backends/metax_gpu/tests/default.txt @@ -0,0 +1,67 @@ +test_accuracy_op +test_tril_triu_op +test_where_op +test_split_op +test_fill_constant_op +test_empty_op +test_sign_op +test_cast_op +test_index_add_op +test_unbind_op +test_put_along_axis_op +test_layer_norm_op +test_maximum_op +test_accuracy_op +test_strided_slice_op +test_sum_op +test_set_value_op +test_flatten_contiguous_range_op +test_top_k_op +test_subtract_op +test_softmax_op +test_cumsum_op +test_greater_equal_op +test_elementwise_div_op +test_top_k_v2_op +test_stack_op +test_one_hot_v2_op +test_fill_any_op +test_gather_op +test_reshape_op +test_index_put_op +test_bitwise_op +test_max_op +test_pad_op +test_elementwise_pow_op +test_uniform_random_op +test_scatter_op +test_cast_op +test_zeros_like_op +test_compare_op +test_shape_op +test_tril_triu_op +test_slice_op +test_elementwise_add_op +test_index_put_op 
+test_bincount_op +test_assign_op +test_logical_op +test_squared_l2_norm_op +test_mean_op +test_fused_bias_act_op +test_expand_v2_op +test_adamw_op +test_gather_nd_op +test_concat_op +test_scatter_nd_op +test_elementwise_floordiv_op +test_elementwise_mul_op +test_transpose_op +test_einsum_op +test_randint_op +test_c_embedding_op +test_numel_op +test_scale_op +test_softmax_with_cross_entropy_op +test_full_op +test_scatter_op diff --git a/backends/metax_gpu/tests/run_test.sh b/backends/metax_gpu/tests/run_test.sh index 95cce650e6b..b9e8ec5b5cc 100755 --- a/backends/metax_gpu/tests/run_test.sh +++ b/backends/metax_gpu/tests/run_test.sh @@ -2,13 +2,13 @@ #!/bin/bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -22,14 +22,61 @@ TEST_PATH1="${SCRIPT_DIR}/../../../python" TEST_PATH2="${SCRIPT_DIR}/../../../python/tests" export PYTHONPATH="${LEGACY_TEST_PATH}:${PYTHONPATH}:${TEST_PATH1}:${TEST_PATH2}" +export +# sleep 1000000 + rm -r build mkdir -p build && cd build -cmake .. +TEST_LOG_LEVEL=0 +TEST_LIST_FILE="" +TEST_LOG_OUTPUT_DIR="" +TEST_PARALLEL_NUM=10 -cmake --build . +while getopts "i:o:v:j:h" opt; do + case "$opt" in + i) + TEST_LIST_FILE="$OPTARG" + ;; + o) + TEST_LOG_OUTPUT_DIR="$OPTARG" + echo "Set log output dir [ $TEST_LOG_OUTPUT_DIR ]" + ;; + v) + TEST_LOG_LEVEL=$OPTARG + ;; + j) + TEST_PARALLEL_NUM="$OPTARG" + ;; + h) + echo "用法:$0 -i <测试列表文件> -o <日志输出路径> ..." 
+ echo "选项说明:" + echo " -i 测试程序列表文件" + echo " -o 日志输出路径" + echo " -v GLOG_v 日志等级" + echo " -j ctest 测试并行数量" + echo " -h 显示帮助" + exit 0 + ;; + \?) + echo "error: unknow option '-$OPTARG'." + exit 1 + ;; + :) + echo "error option '-$OPTARG' must have parameter." + exit 1 + ;; + esac +done + + +export GLOG_v=$TEST_LOG_LEVEL -ctest -j1 --output-on-failure +cmake .. -DTEST_LIST_FILE=$TEST_LIST_FILE -DLOG_OUTPUT_DIR=$TEST_LOG_OUTPUT_DIR + +cmake --build . + +ctest -j$TEST_PARALLEL_NUM --output-on-failure diff --git a/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh new file mode 100644 index 00000000000..86bfcb08f86 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/check_diff_metax_legacy_unit_test.sh @@ -0,0 +1,108 @@ +#!/bin/bash + +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +SOURCE_DIR="backends/metax_gpu/tests/unittest" +SEARCH_DIR="Paddle/test/legacy_test" +PREFIX_FILE="metax_prefixes.txt" +UNMATCHED_FILE="unmatched_files.txt" +EXIST_FILE="existing_files.txt" +MISS_FILE="missing_files.txt" + +# 检查源路径是否存在 +if [ ! -d "$SOURCE_DIR" ]; then + echo "错误: 源路径 '$SOURCE_DIR' 不存在或不是一个目录" + exit 1 +fi + +# 检查搜索路径是否存在 +if [ ! -d "$SEARCH_DIR" ]; then + echo "错误: 搜索路径 '$SEARCH_DIR' 不存在或不是一个目录" + exit 1 +fi + +# 第一步:提取前缀(根据新规则处理) +echo "第一步:从 '$SOURCE_DIR' 提取文件前缀(按_op/_metax规则)..." 
+> "$PREFIX_FILE" # 清空前缀文件 +> "$UNMATCHED_FILE" # 清空未匹配文件列表 + +find "$SOURCE_DIR" -type f -name "*.py" | while read -r file; do + filename=$(basename "$file") + prefix="" + + # 规则1:如果包含_op关键字,提取_op前的所有字符 + if [[ "$filename" == *"_op"* ]]; then + prefix="${filename%%_op*}" + echo "提取前缀(_op规则): $prefix (来自 $filename)" + echo "$prefix" >> "$PREFIX_FILE" + + # 规则2:如果没有_op但有_metax,提取_metax前的所有字符 + elif [[ "$filename" == *"_metax"* ]]; then + prefix="${filename%%_metax*}" + echo "提取前缀(_metax规则): $prefix (来自 $filename)" + echo "$prefix" >> "$PREFIX_FILE" + + # 规则3:都不包含,归类到未匹配 + else + echo "未匹配的文件: $filename(不包含_op和_metax)" + echo "$filename" >> "$UNMATCHED_FILE" + fi +done + +# 检查是否有提取到前缀或未匹配文件 +prefix_count=$(wc -l < "$PREFIX_FILE") +unmatched_count=$(wc -l < "$UNMATCHED_FILE") + +echo "提取完成 - 有效前缀: $prefix_count 个,未匹配文件: $unmatched_count 个" + +if [ $prefix_count -eq 0 ] && [ $unmatched_count -eq 0 ]; then + echo "警告: 在 '$SOURCE_DIR' 中未找到任何以 '_metax.py' 结尾的文件" + exit 0 +fi + +# 第二步:在搜索路径中查找同名文件(仅搜索当前目录,不包括子文件夹) +echo -e "\n第二步:在 '$SEARCH_DIR' 中搜索同名文件(深度为1)..." +> "$EXIST_FILE" # 清空存在文件列表 +> "$MISS_FILE" # 清空缺失文件列表 + +# 逐个处理每个前缀 +while read -r prefix; do + # 跳过空行 + if [ -z "$prefix" ]; then + continue + fi + + # 只在搜索路径的直接目录下查找(深度为1) + found=$(find "$SEARCH_DIR" -maxdepth 1 -type f -name "${prefix}_op.py" -print -quit) + + if [ -n "$found" ]; then + echo "$prefix -> 找到文件: $found" + echo "${prefix}_op.py" >> "$EXIST_FILE" + else + echo "$prefix -> 未找到同名文件" + echo "$prefix" >> "$MISS_FILE" + fi +done < "$PREFIX_FILE" + +# 输出结果统计 +exist_count=$(wc -l < "$EXIST_FILE") +miss_count=$(wc -l < "$MISS_FILE") + +echo -e "\n处理完成!" 
+echo "找到同名文件的前缀数量: $exist_count(已保存到 $EXIST_FILE)" +echo "未找到同名文件的前缀数量: $miss_count(已保存到 $MISS_FILE)" +echo "未匹配规则的文件数量: $unmatched_count(已保存到 $UNMATCHED_FILE)" diff --git a/backends/metax_gpu/tests/scripts/classify.json b/backends/metax_gpu/tests/scripts/classify.json new file mode 100644 index 00000000000..b97255adc3d --- /dev/null +++ b/backends/metax_gpu/tests/scripts/classify.json @@ -0,0 +1,22 @@ +{ + "OK":{ + "skipped":{ + "rule":["skipped="] + } + }, + + "FAILED":{ + "precision":{ + "rule":["Mismatched elements"] + }, + "api":{ + "rule":["(PermissionDenied) Cannot use CUDAPinnedPlace", "ValueError: The API paddle.device.cuda.get_device_properties", "TypeError: paddle.index_add api"] + }, + "missing":{ + "rule":["missing metax_gpu kernel", "UnimplementedError: There are no kernels which are registered"] + }, + "file_not_found":{ + "rule":["FileNotFoundError:"] + } + } +} diff --git a/backends/metax_gpu/tests/scripts/log_analysis.py b/backends/metax_gpu/tests/scripts/log_analysis.py new file mode 100644 index 00000000000..963d50751f7 --- /dev/null +++ b/backends/metax_gpu/tests/scripts/log_analysis.py @@ -0,0 +1,215 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import fnmatch +import shutil +from enum import Enum + + +class TestResult(Enum): + OK = "OK" + FAILURE = "FAILED" + + +class LogAnalyzer: + def __init__( + self, + classify_file: str, + search_path: str, + pattern: str = None, + encoding: str = "utf-8", + ): + self.__patten = pattern + self.__search_path = search_path + self.__encoding = encoding + self.__statistical_data = {} + + self.__classify_data = self.__read_json_file(classify_file) + for key, value in self.__classify_data.items(): + self.__statistical_data[key] = {} + for sub_key in list(value.keys()): + self.__statistical_data[key][sub_key] = [] + + self.__statistical_data[TestResult.OK.value]["noskip"] = [] + self.__statistical_data[TestResult.FAILURE.value]["other"] = [] + + def __read_json_file(self, path: str) -> dict: + with open(path, "r", encoding=self.__encoding) as f: + data = json.load(f) + f.close() + return data + + def __check_path(self, path: str) -> None: + """ + 处理指定路径: + - 若为文件夹路径:不存在则创建,存在则清空内容 + - 若为文件路径:不存在则创建,存在则清空内容 + """ + try: + # 判断路径是否存在 + if os.path.exists(path): + # 路径存在,判断是文件还是文件夹 + if os.path.isfile(path): + # 处理文件:清空内容 + with open(path, "w", encoding="utf-8") as f: + f.write("") # 写入空内容清空文件 + # print(f"文件已存在,已清空内容: {path}") + + elif os.path.isdir(path): + # 处理文件夹:清空所有内容 + for item in os.listdir(path): + item_path = os.path.join(path, item) + if os.path.isfile(item_path) or os.path.islink(item_path): + os.remove(item_path) # 删除文件或链接 + elif os.path.isdir(item_path): + shutil.rmtree(item_path) # 递归删除子文件夹 + # print(f"文件夹已存在,已清空内容: {path}") + else: + # 路径不存在,判断目标类型(根据最后一个元素是否有扩展名) + # 获取路径的最后一部分 + last_part = os.path.basename(path) + + # 判断是否为文件路径(包含扩展名) + if "." 
in last_part and not last_part.endswith("."): + # 创建文件(包括父目录) + parent_dir = os.path.dirname(path) + if parent_dir and not os.path.exists(parent_dir): + os.makedirs(parent_dir, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + pass # 创建空文件 + # print(f"文件不存在,已创建: {path}") + + else: + # 创建文件夹(支持多级目录) + os.makedirs(path, exist_ok=True) + # print(f"文件夹不存在,已创建: {path}") + + except PermissionError: + print(f"权限错误:无法操作路径 {path}") + except Exception as e: + print(f"处理路径时发生错误: {str(e)}") + + def save_result(self, dir_path: str = "./") -> None: + """ + 判断文件夹是否存在: + - 不存在则创建 + - 存在则清空文件夹内所有内容(保留文件夹本身) + """ + + for key, value in self.__statistical_data.items(): + sub_dir = os.path.join(dir_path, key) + self.__check_path(sub_dir) + + for sub_key, sub_value in value.items(): + # print(f"{sub_key}: {len(value[sub_key])} - ({sub_value})") + try: + with open( + os.path.join(sub_dir, sub_key) + ".txt", "w", encoding="utf-8" + ) as f: + for op_name in sub_value: + if not op_name.endswith("\n"): + op_name += "\n" + f.write(op_name) + # print(f"内容已成功{'追加' if append else '写入'}到 {file_path}") + except Exception as e: + print(f"写入文件失败: {e}") + + def show_result(self) -> None: + test_counts = 0 + for key, value in self.__statistical_data.items(): + print(f"\n---------- {key} ----------") + for sub_key, sub_value in value.items(): + test_counts = test_counts + len(value[sub_key]) + print(f"{sub_key}: {len(value[sub_key])}\n\t{sub_value}\n") + print( + f"\n******************* Total log num: {test_counts} *******************\n\n" + ) + + def run(self): + """ + 读取指定目录下符合命名规则的文件,并遍历每一行 + + 参数: + search_path: 要搜索的根目录 + pattern: 文件名匹配规则(支持通配符,如 '*.txt', 'file_*.log') + """ + for dirpath, dirnames, filenames in os.walk(self.__search_path): + for filename in fnmatch.filter(filenames, self.__patten): + file_path = os.path.join(dirpath, filename) + # print(f"\n===== 正在处理文件: {file_path} =====") + + cur_res_type = TestResult.FAILURE + cur_sub_type = "other" + finish_early = False + + 
try: + with open(file_path, "r", encoding=self.__encoding) as f: + for line in f: + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for keyword in sub_type_params["rule"]: + if keyword in line: + cur_sub_type = sub_type + if sub_type == "missing": + finish_early = True + break + + if finish_early: + break + + if finish_early: + break + + if len(line) >= 2 and line[:2] == "OK": + cur_res_type = TestResult.OK + cur_sub_type = None + for sub_type, sub_type_params in self.__classify_data[ + cur_res_type.value + ].items(): + for rule in sub_type_params["rule"]: + if rule in line: + cur_sub_type = sub_type + break + + op_name = filename.split(".") + if cur_sub_type is None: + self.__statistical_data[cur_res_type.value][ + "noskip" + ].append(op_name[0]) + else: + self.__statistical_data[cur_res_type.value][ + cur_sub_type + ].append(op_name[0]) + # print(f"Result: {cur_res_type.value}, type: {cur_sub_type}") + f.close() + except UnicodeDecodeError: + print(f"警告: 文件 {file_path} 编码不是 utf-8,跳过处理") + except Exception as e: + print(f"处理文件 {file_path} 时出错: {str(e)}") + + +if __name__ == "__main__": + + analyzer = LogAnalyzer( + classify_file="./classify.json", + search_path="./NPU_logs/20250918_065326", + pattern="test_*.log", + ) + + analyzer.run() + analyzer.show_result() + analyzer.save_result("./output") diff --git a/backends/metax_gpu/tests/unit_test/test_abs_metax.py b/backends/metax_gpu/tests/unit_test/test_abs_metax.py new file mode 100644 index 00000000000..0dae6822bba --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_abs_metax.py @@ -0,0 +1,39 @@ +# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +# # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.base.dygraph as dg + + +class TestAbs(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32"] + self._places = [paddle.CustomPlace("metax_gpu", 0)] + + def test_all_positive(self): + for dtype in self._dtypes: + x = 1 + 10 * np.random.random([13, 3, 3]).astype(dtype) + for place in self._places: + with dg.guard(place): + y = paddle.abs(paddle.to_tensor(x)) + np.testing.assert_allclose(np.abs(x), y.numpy(), rtol=1e-05) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_arange_metax.py b/backends/metax_gpu/tests/unit_test/test_arange_metax.py new file mode 100644 index 00000000000..89308c33401 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_arange_metax.py @@ -0,0 +1,260 @@ +# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +# # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from op_test import OpTest, convert_float_to_uint16 + +import paddle +from paddle.base import core +from paddle.static import Program, program_guard + + +def arange_wrapper(start, end, step, dtype="float32"): + return paddle.arange(start, end, step, dtype) + + +class TestArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": np.array([self.case[0]]).astype(self.dtype), + "End": np.array([self.case[1]]).astype(self.dtype), + "Step": np.array([self.case[2]]).astype(self.dtype), + } + + self.outputs = { + "Out": np.arange(self.case[0], self.case[1], self.case[2]).astype( + self.dtype + ) + } + + def init_config(self): + self.dtype = np.float32 + self.python_api = arange_wrapper + self.case = (0, 1, 0.2) + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + +class TestFloatArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float32 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +class TestFloat16ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float16 + self.python_api = paddle.arange + self.case = (0, 5, 1) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestBFloat16ArangeOp(OpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + "Start": convert_float_to_uint16(self.start), + "End": convert_float_to_uint16(self.end), + "Step": convert_float_to_uint16(self.step), + } + + self.outputs = { + "Out": convert_float_to_uint16(np.arange(self.start, self.end, self.step)) + } + + def init_config(self): + self.dtype = np.uint16 + self.python_api = arange_wrapper + self.case = (0, 5, 1) + self.start = np.array([self.case[0]]).astype(np.float32) + self.end = np.array([self.case[1]]).astype(np.float32) + self.step = 
np.array([self.case[2]]).astype(np.float32) + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_pir=True, check_symbol_infer=False) + + +class TestInt32ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 5, 2) + + +class TestFloat64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.float64 + self.python_api = paddle.arange + self.case = (10, 1, -2) + + +class TestInt64ArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int64 + self.python_api = paddle.arange + self.case = (-1, -10, -2) + + +class TestZeroSizeArangeOp(TestArangeOp): + def init_config(self): + self.dtype = np.int32 + self.python_api = paddle.arange + self.case = (0, 0, 1) + + +class TestArangeOpError(unittest.TestCase): + def test_static_errors(self): + with program_guard(Program(), Program()): + paddle.enable_static() + self.assertRaises(TypeError, paddle.arange, 10, dtype="int8") + + +class TestArangeAPI(unittest.TestCase): + def test_out(self): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + x1 = paddle.arange(0, 5, 1, "float32") + + place = ( + paddle.CUDAPlace(0) + if core.is_compiled_with_cuda() + else paddle.CPUPlace() + ) + exe = paddle.static.Executor(place) + out = exe.run(fetch_list=[x1]) + + expected_data = np.arange(0, 5, 1).astype(np.float32) + self.assertEqual((out == expected_data).all(), True) + self.assertListEqual(list(x1.shape), [5]) + paddle.disable_static(place) + + +class TestArangeImperative(unittest.TestCase): + def test_out(self): + place = ( + paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace() + ) + paddle.disable_static(place) + x1 = paddle.arange(0, 5, 1) + x2 = paddle.tensor.arange(5) + x3 = paddle.tensor.creation.arange(5) + + start = paddle.to_tensor(np.array([0], "float32")) + end = paddle.to_tensor(np.array([5], 
"float32")) + step = paddle.to_tensor(np.array([1], "float32")) + x4 = paddle.arange(start, end, step, "int64") + + expected_data = np.arange(0, 5, 1).astype(np.int64) + for x in [x1, x2, x3, x4]: + np.testing.assert_array_equal(x.numpy(), expected_data) + + start_float = paddle.to_tensor(np.array([0.5], "float32")) + end_float = paddle.to_tensor(np.array([1.5], "float32")) + step_float = paddle.to_tensor(np.array([0.5], "float32")) + # all [start, end, step] is float + x5 = paddle.arange(start_float, end_float, step_float) + x5_expected_data = np.arange(0.5, 1.5, 0.5).astype(np.float32) + np.testing.assert_array_equal(x5.numpy(), x5_expected_data) + self.assertEqual(x5.numpy().dtype, np.float32) + + # [start, end] is float , [step] is int + x6 = paddle.arange(start_float, end_float, 1) + x6_expected_data = np.arange(0.5, 1.5, 1).astype(np.float32) + np.testing.assert_array_equal(x6.numpy(), x6_expected_data) + self.assertEqual(x6.numpy().dtype, np.float32) + + # [start] is float , [end] is int + x7 = paddle.arange(start_float, 1) + x7_expected_data = np.arange(0.5, 1).astype(np.float32) + np.testing.assert_array_equal(x7.numpy(), x7_expected_data) + self.assertEqual(x7.numpy().dtype, np.float32) + + # [start] is float + x8 = paddle.arange(start_float) + x8_expected_data = np.arange(0.5).astype(np.float32) + np.testing.assert_array_equal(x8.numpy(), x8_expected_data) + self.assertEqual(x8.numpy().dtype, np.float32) + + # [start] is int + x9 = paddle.arange(1) + x9_expected_data = np.arange(1).astype(np.int64) + np.testing.assert_array_equal(x9.numpy(), x9_expected_data) + self.assertEqual(x9.numpy().dtype, np.int64) + + # [start] is float + x10 = paddle.arange(1.0) + x10_expected_data = np.arange(1).astype(np.float32) + np.testing.assert_array_equal(x10.numpy(), x10_expected_data) + self.assertEqual(x10.numpy().dtype, np.float32) + + # [start] is np.int + x11 = paddle.arange(np.int64(10)) + x11_expected_data = np.arange(10).astype(np.int64) + 
np.testing.assert_array_equal(x11.numpy(), x11_expected_data) + self.assertEqual(x11.numpy().dtype, np.int64) + + # [start] is a big integer + x12 = paddle.arange( + start=0, + end=-9007199254740994, + step=-9007199254740993, + ) + + # numpy give wrong result here, so we generate 'x12_expected_data' manually + # x12_expected_data = np.arange(start=0, stop=-9007199254740994, step=-9007199254740993, dtype=np.int64) + x12_expected_data = np.array([0, -9007199254740993]) + + np.testing.assert_array_equal(x12.numpy(), x12_expected_data) + self.assertEqual(x12.numpy().dtype, np.int64) + + # [startend step>0] + x14 = paddle.arange(start=10, end=0, step=1) + + x14_expected_data = np.array([]) + np.testing.assert_array_equal(x14.numpy(), x14_expected_data) + + paddle.enable_static() + + +class TestArangeStatic(unittest.TestCase): + def test_infermeta(self): + paddle.enable_static() + x = paddle.arange(0, 1 + 0.005, 0.005) + self.assertEqual(x.shape, [201]) + paddle.disable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py new file mode 100644 index 00000000000..f575d4eece0 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_bfloat16_embedding_metax.py @@ -0,0 +1,72 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 

import unittest

import numpy as np

import paddle
import paddle.nn.functional as F


class BF16EmbeddingTest(unittest.TestCase):
    """Compare ``F.embedding`` run in bfloat16 against the float32 result.

    For each dtype the forward output and the gradient with respect to the
    embedding table are computed from the same seeded inputs, cast back to
    the table's original dtype and compared with a loose tolerance.
    """

    def setUp(self):
        self.seed = 10
        self.batch_size, self.vocab_size, self.hidden_size = 30, 1024, 512

    def gen_random(self):
        """Return ``(ids, weight, dout)`` tensors drawn after re-seeding NumPy."""
        np.random.seed(self.seed)
        table_shape = [self.vocab_size, self.hidden_size]
        grad_shape = [self.batch_size, self.hidden_size]
        # The draw order (table, ids, upstream grad) fixes the values.
        table_np = np.random.random(table_shape).astype("float32")
        ids_np = np.random.randint(low=0, high=self.vocab_size, size=[self.batch_size])
        dout_np = np.random.random(grad_shape).astype("float32")

        weight = paddle.to_tensor(table_np)
        weight.stop_gradient = False  # the table is the tensor we differentiate
        return paddle.to_tensor(ids_np), weight, paddle.to_tensor(dout_np)

    def run_main(self, dtype):
        """Run embedding forward/backward with the table cast to ``dtype``.

        Returns a 2-tuple of NumPy arrays: the forward output and the
        gradient of the (uncast) table, both in the table's original dtype.
        """
        ids, weight, dout = self.gen_random()
        base_dtype = weight.dtype
        out = F.embedding(ids, weight.astype(dtype))
        grads = paddle.autograd.grad(out, weight, dout.astype(out.dtype))
        out_np = out.astype(base_dtype).numpy()
        dweight_np = grads[0].astype(base_dtype).numpy()
        return (out_np, dweight_np)

    def test_main(self):
        fp32_results = self.run_main("float32")
        bf16_results = self.run_main("bfloat16")
        self.assertEqual(len(fp32_results), len(bf16_results))
        for fp32_value, bf16_value in zip(fp32_results, bf16_results):
            np.testing.assert_allclose(fp32_value, bf16_value, atol=1e-3, rtol=1e-2)


class BF16EmbeddingTestOddHiddenSize(BF16EmbeddingTest):
    """Same comparison with a 511-row table (hidden size stays 512)."""

    def setUp(self):
        self.seed = 20
        self.batch_size, self.vocab_size, self.hidden_size = 30, 511, 512


if __name__ == "__main__":
    unittest.main()

# ---- patch metadata and first header line of the next file in this patch ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py
# new file mode 100644
# index 00000000000..57a5d0b1c97
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_count_nonzero_api_metax.py
# @@ -0,0 +1,81 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle

np.random.seed(10)


class TestCountNonzeroAPI(unittest.TestCase):
    """Check ``paddle.count_nonzero`` against ``np.count_nonzero``."""

    # test paddle.tensor.math.count_nonzero

    def setUp(self):
        self.place = paddle.CustomPlace("metax_gpu", 0)
        self.x_shape = [2, 3, 4, 5]
        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)

    def test_api_static(self):
        """All public aliases, reducing over every axis, must agree with NumPy."""
        paddle.enable_static()
        with paddle.static.program_guard(paddle.static.Program()):
            x = paddle.static.data("X", self.x_shape)
            all_axes = np.arange(len(self.x_shape)).tolist()
            fetches = [
                paddle.count_nonzero(x),
                paddle.tensor.count_nonzero(x),
                paddle.tensor.math.count_nonzero(x),
                paddle.count_nonzero(x, all_axes),
                paddle.count_nonzero(x, tuple(all_axes)),
            ]
            executor = paddle.static.Executor(self.place)
            results = executor.run(feed={"X": self.x}, fetch_list=fetches)
        expected = np.count_nonzero(self.x)
        for result in results:
            np.testing.assert_allclose(result, expected, rtol=1e-05)

    def test_api_dygraph(self):
        """Sweep axis / keepdim combinations in dynamic-graph mode."""
        paddle.disable_static(self.place)

        def test_case(x, axis=None, keepdim=False):
            out = paddle.count_nonzero(paddle.to_tensor(x), axis=axis, keepdim=keepdim)
            np_axis = axis
            if isinstance(np_axis, list):
                # NumPy wants a tuple of axes; an empty list means "all axes".
                np_axis = tuple(np_axis) or None
            expected = np.count_nonzero(x, np_axis, keepdims=keepdim)
            np.testing.assert_allclose(out.numpy(), expected, rtol=1e-05)

        # (axis, keepdim) pairs, in the order they were originally exercised.
        cases = (
            (None, False),
            (None, False),
            (-1, False),
            (None, True),
            (2, True),
            ([0, 2], False),
            ((0, 2), False),
            ((0, 1, 3), False),
            ([0, 1, 2, 3], False),
        )
        for axis, keepdim in cases:
            test_case(self.x, axis, keepdim)
        paddle.enable_static()

    def test_errors(self):
        """An out-of-range axis must raise ``ValueError`` at graph-build time."""
        paddle.enable_static()
        with paddle.static.program_guard(paddle.static.Program()):
            x = paddle.static.data("X", [10, 12], "int32")
            with self.assertRaises(ValueError):
                paddle.count_nonzero(x, axis=10)


if __name__ == "__main__":
    unittest.main()

# ---- patch metadata and licence header of the next file in this patch ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py
# new file mode 100644
# index 00000000000..73e389324f9
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_gaussian_nll_loss_metax.py
# @@ -0,0 +1,208 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
import paddle.nn.functional as F
from paddle.base import core

np.random.seed(10)


def ref_gaussian_nll_loss(
    input, label, variance, full=False, eps=1e-6, reduction="none"
):
    """NumPy reference implementation of the Gaussian NLL loss.

    Args:
        input: predicted values (ndarray).
        label: target values, same shape as ``input``.
        variance: same shape as ``input``, or ``input`` without its last
            axis, or ``input`` with the last axis of size 1.
        full: when true, add the constant ``0.5 * log(2 * pi)`` term.
        eps: lower clamp applied to ``variance`` before taking the log.
        reduction: ``"none"``, ``"sum"`` or ``"mean"``.

    Returns:
        The element-wise loss array for ``"none"``; otherwise a one-element
        list holding the reduced value.

    Raises:
        ValueError: on an incompatible variance shape, an unknown
            ``reduction``, or any negative variance entry.
    """
    if variance.shape != input.shape:
        if input.shape[:-1] == variance.shape:
            # One variance per leading index: add a trailing axis to broadcast.
            variance = np.expand_dims(variance, -1)
        elif not (
            input.shape[:-1] == variance.shape[:-1] and variance.shape[-1] == 1
        ):
            raise ValueError("variance is of incorrect size")
    if reduction not in ("none", "mean", "sum"):
        raise ValueError(reduction + " is not valid")

    if np.any(variance < 0):
        raise ValueError("var has negative entry/entries")

    # np.clip returns a new array, so the caller's variance is not modified.
    variance = np.clip(variance, a_min=eps, a_max=None)

    loss = 0.5 * (np.log(variance) + (input - label) ** 2 / variance)
    if full:
        loss += 0.5 * np.log(2 * np.pi)

    if reduction == "none":
        return loss
    if reduction == "sum":
        return [np.sum(loss)]
    return [np.mean(loss)]


class TestGaussianNLLLossAPI(unittest.TestCase):
    """Compare paddle's Gaussian NLL loss with ``ref_gaussian_nll_loss``.

    ``test_dynamic_case`` / ``test_static_case`` are parameterised drivers
    called by the other tests. Because their names start with ``test`` they
    are also collected and run once with their default arguments.
    """

    # test paddle.nn.functional.gaussian_nll_loss, paddle.nn.gaussian_nll_loss

    def setUp(self, type=None):
        """Create NumPy inputs for the scenario named by ``type``.

        ``type`` is a dtype name, ``"broadcast1"`` / ``"broadcast2"`` (variance
        of lower rank / trailing size 1), ``"test_err"`` (negative variance),
        or ``None`` for the default float32 case.
        """
        self.shape = [10, 2]
        if type in ["float16", "float64", "int32", "int64"]:
            dtype = np.dtype(type)
            self.input_np = np.random.random(self.shape).astype(dtype)
            self.label_np = np.random.random(self.shape).astype(dtype)
            self.variance_np = np.ones(self.shape).astype(dtype)
        elif type in ["broadcast1", "broadcast2"]:
            self.shape = [10, 2, 3]
            self.broadcast_shape = [10, 2] if type == "broadcast1" else [10, 2, 1]
            self.input_np = np.random.random(self.shape).astype(np.float32)
            self.label_np = np.random.random(self.shape).astype(np.float32)
            self.variance_np = np.ones(self.broadcast_shape).astype(np.float32)
        else:
            dtype = np.dtype("float32")
            self.input_np = np.random.random(self.shape).astype(dtype)
            self.label_np = np.random.random(self.shape).astype(dtype)
            self.variance_np = np.ones(self.shape).astype(dtype)
            if type == "test_err":
                self.variance_np = -np.ones(self.shape).astype(np.float32)

        # NOTE(review): this file lives in the metax_gpu test suite, yet the
        # place is CUDA-or-CPU; sibling metax tests build a
        # paddle.CustomPlace("metax_gpu", 0). Confirm which device is intended.
        self.place = (
            paddle.CUDAPlace(0) if core.is_compiled_with_cuda() else paddle.CPUPlace()
        )

    def test_dynamic_case(self, type=None, full=False, reduction="none"):
        """Dynamic-graph check of the functional and the layer API."""
        self.setUp(type)
        paddle.disable_static(self.place)
        try:
            input_x = paddle.to_tensor(self.input_np)
            label = paddle.to_tensor(self.label_np)
            variance = paddle.to_tensor(self.variance_np)
            if type in ["test_err", "int32", "int64"]:
                self.assertRaises(
                    ValueError,
                    paddle.nn.functional.gaussian_nll_loss,
                    input=input_x,
                    label=label,
                    variance=variance,
                )
            else:
                out_ref = ref_gaussian_nll_loss(
                    self.input_np,
                    self.label_np,
                    self.variance_np,
                    full=full,
                    reduction=reduction,
                )
                out1 = F.gaussian_nll_loss(
                    input_x, label, variance, full=full, reduction=reduction
                )
                gaussian_nll_loss = paddle.nn.GaussianNLLLoss(
                    full, reduction=reduction
                )
                out2 = gaussian_nll_loss(input_x, label, variance)

                for r in [out1, out2]:
                    # A bare np.allclose() only returns a bool; assert on it.
                    np.testing.assert_allclose(
                        r.numpy(), out_ref, rtol=1e-5, atol=1e-5
                    )
        finally:
            # Restore static mode even when an assertion above fails.
            paddle.enable_static()

    def test_static_case(self, type=None, full=False, reduction="none"):
        """Static-graph check of the functional and the layer API."""
        self.setUp(type)
        paddle.enable_static()
        with paddle.static.program_guard(paddle.static.Program()):
            if type in ["int32", "int64", "float64"]:
                input_x = paddle.static.data("Input_x", self.shape, type)
                label = paddle.static.data("Label", self.shape, type)
                variance = paddle.static.data("Variance", self.shape, type)
            elif type in ["broadcast1", "broadcast2"]:
                input_x = paddle.static.data("Input_x", self.shape)
                label = paddle.static.data("Label", self.shape)
                variance = paddle.static.data("Variance", self.broadcast_shape)
            else:
                input_x = paddle.static.data("Input_x", self.shape, "float32")
                label = paddle.static.data("Label", self.shape, "float32")
                variance = paddle.static.data("Variance", self.shape, "float32")
            out1 = F.gaussian_nll_loss(
                input_x, label, variance, full=full, reduction=reduction
            )
            gaussian_nll_loss = paddle.nn.GaussianNLLLoss(full, reduction=reduction)
            out2 = gaussian_nll_loss(input_x, label, variance)
            exe = paddle.static.Executor(self.place)
            feed = {
                "Input_x": self.input_np,
                "Label": self.label_np,
                "Variance": self.variance_np,
            }
            if type not in ["test_err", "int32", "int64"]:
                out_ref = ref_gaussian_nll_loss(
                    self.input_np,
                    self.label_np,
                    self.variance_np,
                    full=full,
                    reduction=reduction,
                )
                res = exe.run(feed=feed, fetch_list=[out1, out2])
                for r in res:
                    # A bare np.allclose() only returns a bool; assert on it.
                    np.testing.assert_allclose(r, out_ref, rtol=1e-5, atol=1e-5)
            else:
                # Best effort, kept as in the original: a ValueError is
                # tolerated but not required here.
                # NOTE(review): this branch passes whether or not an error is
                # raised - confirm whether assertRaises was intended.
                try:
                    exe.run(feed=feed, fetch_list=[out1, out2])
                except ValueError:
                    pass

    def test_api(self):
        self.test_dynamic_case()
        self.test_static_case()

    def test_float64(self):
        self.test_dynamic_case("float64")
        self.test_static_case("float64")

    def test_broadcast(self):
        self.test_dynamic_case("broadcast1")
        self.test_static_case("broadcast1")

    def test_broadcast_with_same_dim(self):
        self.test_dynamic_case("broadcast2")
        self.test_static_case("broadcast2")

    def test_reduction(self):
        self.test_dynamic_case(full=True, reduction="mean")
        self.test_dynamic_case(full=True, reduction="sum")
        self.test_static_case(full=True, reduction="mean")

    def test_error(self):
        self.test_dynamic_case("test_err")
        self.test_static_case("test_err")

    def test_int(self):
        self.test_dynamic_case("int64")
        self.test_dynamic_case("int32")


if __name__ == "__main__":
    unittest.main()

# ---- start of the next file's patch header (it continues on the following patch line) ----
# diff --git
# ---- patch metadata for this file (the leading "diff --git" sits on the previous patch line) ----
# a/backends/metax_gpu/tests/unit_test/test_greater_equal.py b/backends/metax_gpu/tests/unit_test/test_greater_equal.py
# new file mode 100644
# index 00000000000..816d6075099
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_greater_equal.py
# @@ -0,0 +1,44 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import unittest

import numpy as np

import paddle
from paddle import static


class Test_Greater_Equal_Op_Fp16(unittest.TestCase):
    """float16 ``paddle.greater_equal`` executed on the current device type."""

    def test_api_fp16(self):
        paddle.enable_static()
        with static.program_guard(static.Program(), static.Program()):
            lhs = paddle.to_tensor([3, 3], dtype="float16")
            rhs = paddle.to_tensor([3, 2], dtype="float16")
            out = paddle.greater_equal(x=lhs, y=rhs)
            # Run on card 0 of whatever device type is currently selected
            # (the part of the device string before ":").
            device_type = paddle.device.get_device().split(":")[0]
            executor = static.Executor(paddle.CustomPlace(device_type, 0))
            (res,) = executor.run(fetch_list=[out])
            all_match = (res == np.array([True, True])).all()
            self.assertEqual(all_match, True)


if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()

# ---- start of the next file's patch header (it continues on the following patch line) ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py
# new file mode 100644
# index
# ---- patch metadata for this file (the header starts on the previous patch line) ----
# 00000000000..b4e4282c5ce
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_incubate_build_src_rank_and_local_expert_id.py
# @@ -0,0 +1,62 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import unittest

import numpy as np

import paddle
from paddle.incubate.nn.functional import build_src_rank_and_local_expert_id

logger = logging.getLogger(__name__)


class TestFusedCalculateAuxLoss(unittest.TestCase):
    """Check ``build_src_rank_and_local_expert_id`` against a NumPy reference.

    NOTE(review): the class name mentions "aux loss" but the only test covers
    build_src_rank_and_local_expert_id - looks like a copy-paste name; rename
    if nothing selects this test by name.
    """

    def test_build_src_rank_and_local_expert_id(self):
        num_local_experts = 12
        # 96 global experts (12 * 8); a local seeded generator keeps the test
        # reproducible without touching NumPy's global RNG state.
        rng = np.random.RandomState(2025)
        expert_num_global = rng.randint(
            0, 512, size=[num_local_experts * 8], dtype="int32"
        )
        expert_num_global_tensor = paddle.to_tensor(expert_num_global, dtype="int64")

        # Reference: global expert i, holding expert_num_global[i] tokens,
        # contributes that many copies of (i // num_local_experts) to the
        # source-rank list and of (i % num_local_experts) to the local-id list.
        expert_index = np.arange(expert_num_global.size)
        send_rank_ref = np.repeat(
            expert_index // num_local_experts, expert_num_global
        ).astype("int32")
        local_expert_id_ref = np.repeat(
            expert_index % num_local_experts, expert_num_global
        ).astype("int32")

        send_rank, local_expert_id = build_src_rank_and_local_expert_id(
            expert_num_global_tensor, expert_num_global, num_local_experts
        )

        # unittest-style checks: a bare ``assert`` disappears under ``python -O``.
        np.testing.assert_array_equal(send_rank.numpy(), send_rank_ref)
        np.testing.assert_array_equal(local_expert_id.numpy(), local_expert_id_ref)


if __name__ == "__main__":
    unittest.main()

# ---- patch metadata and start of the licence header of the next file in this patch ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py
# new file mode 100644
# index 00000000000..2d5670ee739
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_incubate_expand_modality_expert_id.py
# @@ -0,0 +1,172 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
from collections import namedtuple
from functools import partial

import numpy as np
from ernie_utils.moe_all_gather_layer import MOEAllGatherLayerV2

import paddle
import paddle.nn.functional as F
from paddle.incubate.nn.functional import expand_modality_expert_id


def fused_gate_logits_process_ref(self, gate_logits_lm, gate_logits_mm, token_type_ids):
    """Reference (unfused) post-processing of gate logits.

    Args:
        self: object exposing ``k``, ``config.moe_world_size``, ``gate.act``
            and ``group_experts`` (a stand-in namedtuple in this test).
        gate_logits_lm: gate logits of the first ("lm") modality.
        gate_logits_mm: gate logits of the second ("mm") modality; only read
            when ``token_type_ids`` is not None.
        token_type_ids: per-token selector between the two modalities, or
            None to process the "lm" logits only.

    Returns:
        ``(weight_and_expert, prob_lm, prob_mm)`` where ``weight_and_expert``
        is the top-k weights concatenated with the expert ids (cast to
        float32) on the last axis, ``prob_lm`` is flattened to 2-D, and
        ``prob_mm`` is None when ``token_type_ids`` is None.
    """
    top_k = self.k
    num_expert_per_rank_per_modality = (
        gate_logits_lm.shape[-1] // self.config.moe_world_size
    )

    @paddle.no_grad()
    def shift_ids(ids, modality_offset):
        # For now, every modality is assumed to have the same number of experts.
        rank = ids // num_expert_per_rank_per_modality
        expert_id_in_rank = ids % num_expert_per_rank_per_modality
        return (
            rank * (num_expert_per_rank_per_modality * 2)
            + expert_id_in_rank
            + modality_offset * num_expert_per_rank_per_modality
        )

    if self.group_experts:
        # Split the experts into top_k groups and pick the best expert of
        # each group; ``scale`` turns in-group ids back into flat ids.
        gate_logits_lm = gate_logits_lm.reshape([gate_logits_lm.shape[0], top_k, -1])
        prob_lm = self.gate.act(gate_logits_lm)
        weight_lm, expert_id_lm = prob_lm.topk(k=1, axis=-1)
        weight_lm = weight_lm.reshape([gate_logits_lm.shape[0], -1])
        expert_id_lm = expert_id_lm.reshape([gate_logits_lm.shape[0], -1])
        group_size = gate_logits_lm.shape[-1]
        scale = paddle.arange(0, top_k * group_size, group_size).unsqueeze(0)
        expert_id_lm = expert_id_lm + scale
    else:
        prob_lm = self.gate.act(gate_logits_lm)
        weight_lm, expert_id_lm = prob_lm.topk(k=top_k, axis=-1)
    # NOTE(review): the source formatting was collapsed; this ``if`` is taken
    # to sit at function level (applies to both branches above) - confirm.
    if token_type_ids is not None:
        expert_id_lm = shift_ids(expert_id_lm, 0)
    expert_id_lm.stop_gradient = True
    lm_weight_and_expert_id = paddle.concat(
        [weight_lm, expert_id_lm.astype("float32")], -1
    )
    if token_type_ids is None:
        return (
            lm_weight_and_expert_id,
            prob_lm.reshape([prob_lm.shape[0], -1]),
            None,
        )

    prob_mm = self.gate.act(gate_logits_mm)
    weight_mm, expert_id_mm = prob_mm.topk(k=top_k, axis=-1)

    expert_id_mm = shift_ids(expert_id_mm, 1)
    expert_id_mm.stop_gradient = True

    mm_weight_and_expert_id = paddle.concat(
        [weight_mm, expert_id_mm.astype("float32")], -1
    )

    # Per-token blend: token type 0 keeps the lm result, 1 the mm result.
    token_type_ids_float = token_type_ids[:, None].astype("float32")
    weight_and_expert = (
        1 - token_type_ids_float
    ) * lm_weight_and_expert_id + token_type_ids_float * mm_weight_and_expert_id
    return weight_and_expert, prob_lm.reshape([prob_lm.shape[0], -1]), prob_mm


def test_expand_modality_expert_id():
    """Check the fused id expansion and the fused gate-logit processing.

    Part 1 compares ``expand_modality_expert_id`` with a tensor-op reference.
    Part 2 compares ``MOEAllGatherLayerV2.fused_gate_logits_process_fused``
    with ``fused_gate_logits_process_ref`` for ``token_type_ids=None``.
    NOTE(review): the multimodal path (non-None token_type_ids) is not
    exercised here - confirm whether that is intentional.
    """

    def expand_id_one(
        expert_id,
        num_expert_per_modality,
        k,
        group_size,
        modality_offset,
        is_group_expert,
    ):
        orig_shape = expert_id.shape
        expert_id = expert_id.reshape([-1])
        xid = paddle.arange(len(expert_id))
        if is_group_expert:
            eid = xid % k
            # Out-of-place add: an in-place ``+=`` on the reshaped tensor may
            # write through to the caller's input.
            expert_id = expert_id + eid * group_size

        rank = expert_id // num_expert_per_modality
        expert_id_in_rank = expert_id % num_expert_per_modality
        ret = (
            rank * (num_expert_per_modality * 2)
            + expert_id_in_rank
            + modality_offset * num_expert_per_modality
        )
        return ret.reshape(orig_shape)

    S, E, k = 100, 24, 3
    expert_id_mm = paddle.randint(0, 12, shape=[S, k])
    num_expert_per_rank_per_modality = E // 2 // 4
    group_size = E // 2 // k
    print(f"num_expert_per_rank_per_modality: {num_expert_per_rank_per_modality}")
    fused = expand_modality_expert_id(
        expert_id_mm, num_expert_per_rank_per_modality, group_size, 1, True
    )

    nonfused = expand_id_one(
        expert_id_mm, num_expert_per_rank_per_modality, k, group_size, 1, True
    )
    # Explicit comparison: a bare ``assert`` disappears under ``python -O``.
    np.testing.assert_array_equal(fused.numpy(), nonfused.numpy())

    Config = namedtuple("Config", ["moe_world_size"])
    Self = namedtuple(
        "Self",
        [
            "config",
            "k",
            "gate",
            "group_experts",
            "moe_statics",
            "use_correction_bias",
        ],
    )
    Gate = namedtuple("Gate", ["act"])
    fake_gate = Gate(act=partial(F.softmax, axis=-1))
    fake_self = Self(
        config=Config(
            moe_world_size=8,
        ),
        k=k,
        gate=fake_gate,
        moe_statics=None,
        use_correction_bias=False,
        group_experts=True,
    )

    fake_logits = paddle.randn([S, E])
    fake_logits_mm = paddle.randn([S, E])
    w_and_e, prob_lm, _ = MOEAllGatherLayerV2.fused_gate_logits_process_fused(
        fake_self, fake_logits, fake_logits_mm, None
    )
    w_and_e_ref, prob_lm_ref, _ = fused_gate_logits_process_ref(
        fake_self, fake_logits, fake_logits_mm, None
    )
    np.testing.assert_array_equal(prob_lm.numpy(), prob_lm_ref.numpy())
    np.testing.assert_array_equal(w_and_e.numpy(), w_and_e_ref.numpy())


class Test_expand_modality_expert_id_API(unittest.TestCase):
    """unittest entry point for ``test_expand_modality_expert_id``."""

    def test_dygraph(self):
        test_expand_modality_expert_id()


if __name__ == "__main__":

    unittest.main()

# ---- patch metadata and licence header of the next file in this patch ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py
# new file mode 100644
# index 00000000000..ca0a780e908
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_incubate_fused_rmsnorm_ext_metax.py
# @@ -0,0 +1,95 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import numpy as np

import paddle
from paddle.incubate.nn.functional import fused_rms_norm_ext


class TestFusedRMSNorm(unittest.TestCase):
    """Compare ``fused_rms_norm_ext`` with an RMSNorm built from basic ops."""

    def setUp(self):
        paddle.seed(2023)
        np.random.seed(2023)

    def rms_norm_reference(self, x, scale, bias=None, epsilon=1e-5):
        """Return ``(normalized, 1 / rms)`` computed over the last axis of ``x``."""
        mean_square = paddle.mean(paddle.square(x), axis=-1, keepdim=True)
        rms = paddle.sqrt(mean_square + epsilon)
        normalized = (x / rms) * scale.reshape([1, -1])
        if bias is not None:
            normalized = normalized + bias.reshape([1, -1])
        inv_rms = (1.0 / rms).squeeze(-1)
        return normalized, inv_rms

    def _check_forward(self, rows, cols):
        """Forward comparison shared by the two forward tests."""
        x = paddle.randn([rows, cols])
        scale = paddle.randn([cols])

        y_fused, invvar_fused = fused_rms_norm_ext(x, scale)
        y_ref, invvar_ref = self.rms_norm_reference(x, scale)

        for got, want in ((y_fused, y_ref), (invvar_fused, invvar_ref)):
            np.testing.assert_allclose(got, want, rtol=1e-5, atol=1e-5)

    def test_2d_input(self):
        self._check_forward(32, 64)

    def test_without_bias(self):
        self._check_forward(32, 64)

    def test_backward(self):
        """Gradients of ``mean(y)`` w.r.t. ``x`` and ``scale`` must match."""
        rows, cols = 16, 32
        x = paddle.randn([rows, cols], dtype="float32")
        x.stop_gradient = False
        scale = paddle.randn([cols], dtype="float32")
        scale.stop_gradient = False

        # Fused path first; clone the gradients before they are cleared.
        y_fused, _ = fused_rms_norm_ext(x, scale)
        paddle.mean(y_fused).backward()
        fused_grads = (x.grad.clone(), scale.grad.clone())

        for leaf in (x, scale):
            leaf.clear_gradient()

        # Reference path on the very same leaf tensors.
        y_ref, _ = self.rms_norm_reference(x, scale)
        paddle.mean(y_ref).backward()
        ref_grads = (x.grad, scale.grad)

        for got, want in zip(fused_grads, ref_grads):
            np.testing.assert_allclose(got, want, rtol=1e-4, atol=1e-4)


if __name__ == "__main__":
    unittest.main()

# ---- patch metadata and licence header of the next file in this patch ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py
# new file mode 100644
# index 00000000000..23df4e3457b
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_combine.py
# @@ -0,0 +1,193 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import random
import unittest

import numpy as np
from ernie_utils.moe_layer_uneven import GateCombine

import paddle
import paddle.nn.functional as F
from paddle.incubate.nn.functional import moe_combine

os.environ["FLAGS_flash_attn_version"] = "v1"
os.environ["FLAGS_cudnn_deterministic"] = "1"
os.environ["FLAGS_embedding_deterministic"] = "1"


def combining(x, combine_weights, scatter_index, hard_gate=False):
    """
    Gather rows of ``x`` by ``scatter_index`` and mix them with the weights.

    Args:
        x: Tensor[seq, dim]
        combine_weights: [seq, k]
        scatter_index: ** [seq, k] **
        hard_gate: when true, return the gathered rows with axis -2 squeezed
            instead of the weighted sum.

    Returns:
        y: Tensor[s, dim]
    """
    gathered = F.embedding(scatter_index, x)  # [s,k,dim]
    if hard_gate:
        return gathered.squeeze(-2)
    weighted = combine_weights.unsqueeze(-1) * gathered
    return weighted.sum(1)


def _float32_leaf(array):
    """Convert ``array`` to a float32 tensor that records gradients."""
    tensor = paddle.to_tensor(array).cast("float32")
    tensor.stop_gradient = False
    return tensor


def baseline_result(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy):
    """Reference path: ``combining`` forward plus backward with ``grad_numpy``.

    Returns ``[x.grad, combine_weights.grad, y]``.
    """
    x = _float32_leaf(x_numpy)
    combine_weights = _float32_leaf(combine_weights_numpy)
    scatter_index = paddle.to_tensor(scatter_index_numpy)
    grad = paddle.to_tensor(grad_numpy).cast("float32")

    y = combining(x, combine_weights, scatter_index)
    paddle.autograd.backward([y], [grad], True)
    return [x.grad, combine_weights.grad, y]


def test_moe_combine(x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy):
    """Path under test: ``GateCombine.apply`` forward plus backward.

    Same inputs and return layout as ``baseline_result``; the scatter index
    is cast to int32 before the call.
    """
    x = _float32_leaf(x_numpy)
    combine_weights = _float32_leaf(combine_weights_numpy)
    scatter_index = paddle.to_tensor(scatter_index_numpy).cast("int32")
    grad = paddle.to_tensor(grad_numpy).cast("float32")

    y = GateCombine.apply(x, combine_weights, scatter_index)
    paddle.autograd.backward([y], [grad], True)
    return [x.grad, combine_weights.grad, y]


def gen_test_case(S, K, Dim, capacity_factor, seed=1234):
    """Build seeded ``(x, combine_weights, scatter_index, grad)`` NumPy inputs.

    ``x`` has ``int(S * capacity_factor)`` rows. Scatter indices that fall
    beyond the rows of ``x`` are reset to 0 with a combine weight of 0.
    """
    for seeder in (random.seed, np.random.seed, paddle.seed):
        seeder(seed)

    # The order of the np.random draws below determines the generated values.
    x_numpy = np.random.rand(int(S * capacity_factor), Dim).astype(np.float32)
    combine_weights_numpy = np.random.rand(S, K).astype(np.float32)
    num_rows = x_numpy.shape[0]
    slots = S * K
    permutation = np.random.permutation(max(num_rows, slots))
    scatter_index_numpy = permutation[:slots].astype("int64").reshape([S, K])

    out_of_range = scatter_index_numpy >= num_rows
    combine_weights_numpy[out_of_range] = 0
    scatter_index_numpy[out_of_range] = 0
    grad_numpy = np.random.randn(S, Dim).astype(np.float32)
    return x_numpy, combine_weights_numpy, scatter_index_numpy, grad_numpy


def testing(test_case):
    """Run both paths on ``test_case`` and compare output and gradients."""
    bl_x_grad, bl_weights_grad, bl_y = baseline_result(*test_case)
    fused_x_grad, fused_weights_grad, fused_y = test_moe_combine(*test_case)

    def as_numpy(tensor):
        return tensor.astype("float32").numpy()

    np.testing.assert_allclose(
        as_numpy(fused_y),
        as_numpy(bl_y),
        err_msg="fwd precision not pass",
        rtol=1e-6,
    )
    np.testing.assert_allclose(
        as_numpy(fused_x_grad),
        as_numpy(bl_x_grad),
        rtol=1e-6,
        err_msg="bwd grad precision not pass",
    )
    np.testing.assert_allclose(
        as_numpy(fused_weights_grad),
        as_numpy(bl_weights_grad),
        rtol=1e-6,
    )


class TestFused(unittest.TestCase):
    """Precision-alignment tests of the fused combine against the reference."""

    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
    def test_cap_lt_2(
        self,
    ):
        """
        Check precision alignment with capacity_factor=1.8 and K=2.

        Takes no arguments. Returns None on success; raises on mismatch.
        """
        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=1.8))

    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
    def test_cap_eq_2(
        self,
    ):
        """
        Check precision alignment with capacity_factor=2 and K=2.

        Takes no arguments. Returns None on success; raises on mismatch.
        """
        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2))

    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
    def test_cap_gt_2(
        self,
    ):
        """
        Check precision alignment with capacity_factor=2.2 and K=2.

        Takes no arguments. Returns None on success; raises on mismatch.
        """
        testing(gen_test_case(S=1024, K=2, Dim=4096, capacity_factor=2.2))

    @unittest.skipIf(moe_combine is None, "test_moe_combine not installed")
    def test_k_gt_2(
        self,
    ):
        """
        Check precision alignment with capacity_factor=2 and K=8.

        Takes no arguments. Returns None on success; raises on mismatch.
        """
        testing(gen_test_case(S=1024, K=8, Dim=4096, capacity_factor=2))


if __name__ == "__main__":

    unittest.main()

# ---- patch metadata and header lines of the next file in this patch ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
# new file mode 100644
# index 00000000000..4c209970629
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_partial_nosoftmaxtopk.py
# @@ -0,0 +1,218 @@
# ruff: noqa: C419
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import paddle
from paddle.incubate.nn.functional import (
    moe_gate_dispatch,
    moe_gate_dispatch_partial_nosoftmaxtopk,
)


def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op():
    """Exercise the partial, un-padded dispatch op slice by slice next to the padded op.

    The partial op is called once per window of ``local_expert_num`` experts with
    ``use_pad=False``; the regular ``moe_gate_dispatch`` is then called once with
    ``use_pad=True``.  Forward results and input gradients of both paths are
    printed for inspection.  The only hard checks are ``check_ascend`` on every
    partial call and the shape equality of the combine weights.
    """

    s, d, e = 4, 100, 8
    k, cap = 4, 3
    local_expert_num = 2

    # x = paddle.randn([s, d])
    # gate_logits = paddle.randn([s, e])
    # Row i of x is filled with the constant i + 1.
    x = paddle.arange(1, s + 1).unsqueeze(-1).expand([s, d]).astype("bfloat16")
    x_ = x.clone().detach()

    # Gate scores follow a cyclic pattern 1 / (t + 1), so they are not random.
    t = (
        (paddle.arange(0, e)).unsqueeze(0) + paddle.arange(0, -s, -1).unsqueeze(-1)
    ) % e
    gate_logits = (1 / (t + 1)).astype("float32")
    # gate_logits = F.softmax(paddle.randn([s,e]),-1).astype('float32')
    gate_logits_ = gate_logits.clone().detach()
    s = x.shape[0]
    d = x.shape[1]
    e = gate_logits.shape[1]
    x.stop_gradient = False
    x_.stop_gradient = False
    gate_logits.stop_gradient = False
    gate_logits_.stop_gradient = False
    print(f"gate_logits:{gate_logits}")

    def check_ascend(index_rev, chunks):
        """Assert each chunk of ``index_rev`` longer than 2 is non-decreasing."""
        for idx in index_rev.split(chunks.tolist()):
            if len(idx) > 2:
                assert (paddle.diff(idx) >= 0).all(), (index_rev,)

    ys, comm, scatter_idx = [], [], []
    # One partial dispatch per window [ilocal_expert, ilocal_expert + local_expert_num).
    for ilocal_expert in range(0, e, local_expert_num):
        combine_weihgts, expert_id = gate_logits.topk(k=k, axis=1)
        (
            y,
            combine_weihgts,
            scatter_index,
            scatter_index_rev,
            expert_offset,
            expert_num_local,
        ) = moe_gate_dispatch_partial_nosoftmaxtopk(
            x,
            combine_weihgts,
            expert_id.astype("int32"),
            k=k,
            capacity=cap,
            num_experts=gate_logits.shape[-1],
            use_pad=False,
            expert_start_index=ilocal_expert,
            expert_end_index=ilocal_expert + local_expert_num,  # k # cap
            reverse_token_drop=False,
        )
        check_ascend(scatter_index_rev, expert_num_local)
        print(f"y:{y.mean(-1)}")
        print(f"combine_weihgts:{combine_weihgts}")
        print(f"expert_num_local:{expert_num_local}")
        print(f"scatter_index:{scatter_index.transpose([1,0])}")
        print(f"scatter_index_rev:{scatter_index_rev}")

        ys.append(y)
        comm.append(combine_weihgts)
        scatter_idx.append(scatter_index)

    comm_sum = paddle.stack(comm).sum(0)
    ys_sum = paddle.concat(ys)

    # Padded call over all experts, used as the comparison path.
    (
        y_,
        combine_weihgts_,
        scatter_index_,
        expert_offset_,
        expert_id_,
    ) = moe_gate_dispatch(
        x_,
        gate_logits_,
        None,
        k=k,
        capacity=cap,
        use_pad=True,  # k # cap
    )
    # Rows whose sum is not positive are filtered out before printing.
    valid_y = y_.sum(-1) > 0.0
    y_2 = y_[valid_y].squeeze()

    print(
        f"""
        y: {ys_sum.astype("float32").mean(axis=-1)}
        y_: {y_2.astype("float32").mean(axis=-1)}

        comm-weight: {comm_sum}
        comm-weight_: {combine_weihgts_}

        expert_id:{expert_id}
        scatter_index:{scatter_index}
        scatter_index_rev: {scatter_index_rev}
        expert_num_global:{expert_offset}
        expert_num_local:{expert_num_local}
        """
    )

    print("<<< begin backward>>>")

    assert combine_weihgts_.shape == combine_weihgts.shape, (
        combine_weihgts_.shape,
        combine_weihgts.shape,
    )

    dysum, dcombine_weights_sum = paddle.ones_like(ys_sum), paddle.randn(
        comm_sum.shape
    ).astype(comm_sum.dtype)
    dy_, dcombine_weights_ = paddle.ones_like(y_), paddle.ones_like(combine_weihgts_)
    # No upstream gradient for the rows filtered out above.
    dy_[~valid_y] = 0

    # Split the concatenated upstream gradient back into per-slice pieces.
    y_shapes = [len(y) for y in ys]
    for dyy, yy, commm in zip(
        paddle.split(dysum, y_shapes),
        ys,
        comm,
    ):
        print(f"dyy:{dyy.shape}, {yy.shape} {commm.shape}")
        paddle.autograd.backward([yy, commm], [dyy, dcombine_weights_sum])
    print(x.grad.astype("float32").mean(axis=-1))
    print(f"bwd original:{y_.shape} {dy_.shape}")
    paddle.autograd.backward([y_, combine_weihgts_], [dy_, dcombine_weights_])

    print(x_.grad.astype("float32").mean(axis=-1))

    print(
        f"""
        x: {x.grad.astype('float32').mean(axis=-1)}
        x_: {x_.grad.astype('float32').mean(axis=-1)}
        """
    )


def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop():
    """Check which rows the partial op keeps with ``reverse_token_drop=True``.

    Three tokens (row values 1, 2, 3) are routed by ``eid`` with capacity 2 and
    the expert window 0..2.  The output is split by the non-zero entries of
    ``num_ex_local``; the first chunk must hold rows 2 and 3, the second rows 1
    and 2.
    """

    S, E, D = 3, 4, 3
    k = 2
    capacity = 2
    x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16")
    cw = paddle.randn([S, k])
    eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32")  # 1 # 2 # 3
    (
        y,
        cw_,
        idx,
        idx_rev,
        num_ex_global,
        num_ex_local,
    ) = moe_gate_dispatch_partial_nosoftmaxtopk(
        x, cw, eid, k, capacity, E, False, 0, 2, reverse_token_drop=True
    )

    y0, y1 = y.split([i for i in num_ex_local.tolist() if i > 0])
    assert y0[:, 0].astype("int32").tolist() == [2, 3], y0[:, 0]
    assert y1[:, 0].astype("int32").tolist() == [1, 2]


def test_moe_ops_partial_nosoftmax_topk_empty_output():
    """Check that an expert window nobody routes to yields all-zero local counts.

    ``eid`` only names experts 0, 1 and 2, while the window passed to the op is
    3..4, so every entry of ``num_ex_local`` is expected to be 0.
    """

    S, E, D = 3, 4, 3
    k = 2
    capacity = 2
    x = (paddle.arange(S) + 1).unsqueeze(-1).expand([S, D]).astype("bfloat16")
    cw = paddle.randn([S, k])
    paddle.device.synchronize()
    eid = paddle.to_tensor([[0, 1], [0, 1], [0, 2]], dtype="int32")  # 1 # 2 # 3
    (
        y,
        cw_,
        idx,
        idx_rev,
        num_ex_global,
        num_ex_local,
    ) = moe_gate_dispatch_partial_nosoftmaxtopk(
        x, cw, eid, k, capacity, E, False, 3, 4, reverse_token_drop=True
    )
    assert all([i == 0 for i in num_ex_local.tolist()]), num_ex_local


class TestAddition(unittest.TestCase):
    """unittest wrapper so the module-level test functions also run under unittest."""

    def test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op(self):
        test_moe_dispatch_partial_nosoftmaxtopk_nonepad_op()

    def test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop(self):
        test_moe_ops_partial_nosoftmaxtopk_w_reverse_token_drop()

    def test_moe_ops_partial_nosoftmax_topk_empty_output(self):
        test_moe_ops_partial_nosoftmax_topk_empty_output()


if __name__ == "__main__":
    unittest.main()

# ---- patch boundary preserved from the original diff ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py
# new file mode 100644
# index 00000000000..19752abd904
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute.py
# @@ -0,0 +1,207 @@
# !/usr/bin/env python3

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest

import numpy as np

import paddle
import paddle.nn.functional as F
from paddle.incubate.nn.functional import (
    moe_gate_dispatch,
    moe_gate_dispatch_permute,
)

# NOTE(review): these flags are set after `import paddle`; confirm that Paddle
# still reads them at this point, otherwise they need to move above the import.
os.environ["FLAGS_flash_attn_version"] = "v1"
os.environ["FLAGS_cudnn_deterministic"] = "1"
os.environ["FLAGS_embedding_deterministic"] = "1"


class TestFused(unittest.TestCase):
    def test_moe_ops(self):
        """Check how the optional bias argument of ``moe_gate_dispatch`` affects routing.

        A uniform bias (``bias + 1``) must reproduce the un-biased output and
        combine weights exactly; a bias that favours expert 0 must change the
        dispatched output.
        """
        S, E, D = 8192, 64, 128
        k = 4
        x = paddle.randn([S, D], dtype="bfloat16")
        gate_logits = paddle.randn([S, E], dtype="float32")
        x_ = x.clone()
        gate_logits_ = gate_logits.clone()
        x.stop_gradient = True
        x_.stop_gradient = True
        gate_logits.stop_gradient = True
        gate_logits_.stop_gradient = True
        bias = paddle.zeros([E], dtype="float32")
        cap = 512

        # Baseline: no bias.
        (
            y,
            combine_weihgts,
            scatter_index,
            expert_offset,
            expert_id,
        ) = moe_gate_dispatch(
            x,
            gate_logits,
            None,
            k=k,
            capacity=cap,
            use_pad=True,  # k # cap
        )

        (
            y_,
            combine_weihgts_,
            scatter_index_,
            expert_offset_,
            expert_id_,
        ) = moe_gate_dispatch(
            x_,
            gate_logits_,
            bias + 1,  # adding 1 to every expert must not change the routing
            k=k,
            capacity=cap,
            use_pad=True,  # k # cap
        )
        # Favour expert 0 only.
        bias_unbalanced = bias.clone()
        bias_unbalanced[0] += 1
        (
            y__,
            combine_weihgts__,
            scatter_index__,
            expert_offset__,
            expert_id__,
        ) = moe_gate_dispatch(
            x_,
            gate_logits_,
            bias_unbalanced,
            k=k,
            capacity=cap,
            use_pad=True,  # k # cap
        )
        np.testing.assert_equal(
            y.astype("float32").numpy(),
            y_.astype("float32").numpy(),
            err_msg="incubate w bias not match",
        )
        # The bias does not affect the probabilities used as combine weights.
        np.testing.assert_equal(
            combine_weihgts.astype("float32").numpy(),
            combine_weihgts_.astype("float32").numpy(),
            err_msg="incubate w bias not match",
        )
        # Fixed: the original called `.numpy(0)`; `Tensor.numpy()` takes no
        # argument, so this assertion raised TypeError instead of comparing.
        np.testing.assert_(
            (y.astype("float32").numpy() != y__.astype("float32").numpy()).any(),
        )


class TestDispatchPermute(unittest.TestCase):
    def get_detached_input(self, input, prob):
        """Return detached copies of ``input`` and ``prob`` with the same ``stop_gradient``."""
        ret_input = input.detach()
        ret_prob = prob.detach()
        ret_input.stop_gradient = input.stop_gradient
        ret_prob.stop_gradient = prob.stop_gradient
        return ret_input, ret_prob

    def get_stage_input_list(self, x, world_size, stage):
        """Regroup ``x`` into ``stage`` slabs, each built from every ``stage``-th chunk.

        ``x`` is reshaped to ``[world_size * stage, -1, x.shape[-1]]`` and split
        along axis 0; slab ``i`` concatenates chunks ``i, i + stage, ...``.  The
        slabs are stacked on a new leading axis.
        """
        print(world_size, stage, x.shape)
        x = x.reshape([world_size * stage, -1, x.shape[-1]])
        stage_input_list = []
        x_list = paddle.split(x, num_or_sections=(world_size * stage), axis=0)
        for stage_id in range(stage):
            stage_input_list.append(
                paddle.unsqueeze(paddle.concat(x_list[stage_id::stage], axis=0), axis=0)
            )
        stage_input_list = paddle.concat(stage_input_list, axis=0)
        return stage_input_list

    def test_moe_permute_ops(self):
        """Compare ``moe_gate_dispatch_permute`` against re-staged ``moe_gate_dispatch`` output."""
        paddle.seed(2025)

        # Each ROW lists one parameter across the three cases, in the order
        # world_size, num_experts, num_tokens, k, hidden_size; zip(*...) turns
        # the columns into per-case tuples.
        test_cases = [
            (8, 4, 2),
            (64, 16, 32),
            (1024, 1024, 1024),
            (8, 2, 4),
            (4096, 4096, 4096),
        ]
        for world_size, num_experts, num_tokens, k, hidden_size in zip(*test_cases):
            with self.subTest(
                world_size=world_size, num_experts=num_experts, k=k
            ):
                capacity = num_tokens // k
                stages = num_experts // world_size

                tokens = paddle.randn([num_tokens, hidden_size], dtype="float32")
                prob_logits = paddle.randn([num_tokens, num_experts], dtype="float32")
                prob = F.softmax(prob_logits, axis=-1)
                tokens.stop_gradient = False
                prob.stop_gradient = False

                compat_args = (None,)

                ref_input, ref_prob = self.get_detached_input(tokens, prob)
                (
                    ref_dispatched_input,
                    ref_combine_weights_unnorm,
                    ref_scatter_index,
                    ref_dispatch_mask,
                    _,
                ) = moe_gate_dispatch(
                    ref_input,
                    ref_prob,
                    *compat_args,
                    k=k,
                    capacity=capacity,
                    use_pad=True,
                )

                ref_stage_input_list = self.get_stage_input_list(
                    ref_dispatched_input, world_size, stages
                )

                test_input, test_prob = self.get_detached_input(tokens, prob)
                (
                    test_dispatched_input,
                    test_combine_weights_unnorm,
                    test_scatter_index,
                    test_dispatch_mask,
                    _,
                ) = moe_gate_dispatch_permute(
                    test_input,
                    test_prob,
                    *compat_args,
                    k=k,
                    capacity=capacity,
                    world_size=world_size,
                )

                np.testing.assert_equal(
                    test_dispatched_input.shape,
                    ref_stage_input_list.shape,
                    err_msg="moe_permute_ops not match",
                )
                # Bit-exact comparison through the tensor checksum.
                np.testing.assert_equal(
                    test_dispatched_input._md5sum(),
                    ref_stage_input_list._md5sum(),
                    err_msg="moe_permute_ops not match",
                )


if __name__ == "__main__":

    unittest.main()

# ---- patch boundary preserved from the original diff ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py
# new file mode 100644
# index 00000000000..14991becc47
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_incubate_moe_gate_dispatch_w_permute_bwd.py
# @@ -0,0 +1,175 @@
# !/usr/bin/env python3

# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np

import paddle
import paddle.nn.functional as F
from paddle.incubate.nn.functional import (
    moe_gate_dispatch,
    moe_gate_dispatch_permute,
)

batch_size = 4
hidden_size = 2
k = 16
capacity = 2
num_experts = 16

world_size = 2


class TestLayer(paddle.nn.Layer):
    """Thin layer around ``moe_gate_dispatch`` (bias ``None``, padding enabled)."""

    def forward(self, x, gate_prob, k, capacity):
        y, combine_weights, scatter_index, expert_offset, expert_id = moe_gate_dispatch(
            x, gate_prob, None, k, capacity, True
        )
        return y, combine_weights, scatter_index, expert_offset, expert_id


class TestLayerPermute(paddle.nn.Layer):
    """Thin layer around ``moe_gate_dispatch_permute`` using the module-level ``world_size``."""

    def forward(self, x, gate_prob, k, capacity):
        (
            y,
            combine_weights,
            scatter_index,
            expert_offset,
            expert_id,
        ) = moe_gate_dispatch_permute(
            x, gate_prob, None, k, capacity, world_size=world_size
        )
        return y, combine_weights, scatter_index, expert_offset, expert_id


def check_backward_correctness(layer_cls):
    """Compare autograd gradients of ``layer_cls`` with central finite differences.

    The layer is run once on a bfloat16 input and float32 gate probabilities,
    and backward is called with all-ones upstream gradients for the dispatched
    output and the combine weights.  Each element of the input, then of the gate
    probabilities, is perturbed by +/- epsilon and the resulting numerical
    gradient is compared with the analytic one via ``assert_allclose``.

    Args:
        layer_cls: layer class taking ``(x, gate_prob, k, capacity)`` and
            returning the five dispatch outputs.

    Raises:
        AssertionError: if either gradient is outside the tolerance.
    """
    paddle.seed(1024)

    dtype = "bfloat16"
    layer = layer_cls()
    x = paddle.randn([batch_size, hidden_size])

    gate_weight = paddle.randn([hidden_size, num_experts])
    logits = paddle.matmul(x, gate_weight)
    gate_prob = F.softmax(logits, axis=-1)
    print(f"gate_prob: {gate_prob}")

    # The gate probabilities are computed from the float32 input; only the
    # tensor that is dispatched is cast down.
    x = paddle.cast(x, dtype)
    x.stop_gradient = False
    gate_prob.stop_gradient = False

    output, combine_weights, scatter_index, expert_offset, expert_id = layer(
        x, gate_prob, k, capacity
    )

    print(f"output: {output}")
    print(f"combine_weights: {combine_weights}")
    print(f"scatter_index: {scatter_index}")
    print(f"expert_offset: {expert_offset}")
    print(f"expert_id: {expert_id}")

    # output_g = paddle.randn(output.shape).astype(output.dtype)
    # combine_weights_g = paddle.randn(combine_weights.shape).astype(combine_weights.dtype)
    output_g = paddle.ones_like(output)
    combine_weights_g = paddle.ones_like(combine_weights)
    print(f"output_g: {output_g}")
    print(f"combine_weights_g: {combine_weights_g}")

    paddle.autograd.backward(
        tensors=[output, combine_weights],
        grad_tensors=[output_g, combine_weights_g],
    )

    # Numerical estimate of d(sum(output)) / d(input).
    epsilon = 0.005
    input_numpy = x.detach().astype("float32").numpy()
    num_grad = paddle.zeros_like(x)
    flattened = num_grad.reshape([-1])

    for i in range(x.numel()):
        input_pos = input_numpy.copy()
        input_neg = input_numpy.copy()
        input_pos.flat[i] += epsilon
        input_neg.flat[i] -= epsilon

        output_pos, _, _, _, _ = layer(
            paddle.to_tensor(input_pos), gate_prob, k, capacity
        )
        output_neg, _, _, _, _ = layer(
            paddle.to_tensor(input_neg), gate_prob, k, capacity
        )

        # Central difference, computed on tensors (not via numpy).
        grad_value = (output_pos - output_neg).sum() / (2 * epsilon)
        flattened[i] = grad_value

    flattened = flattened.reshape(x.shape)

    print(f"input gradient: {x.grad}")
    print(f"numerical gradient: {flattened}")
    np.testing.assert_allclose(
        x.grad.astype("float32").numpy(),
        flattened.astype("float32").numpy(),
        rtol=1e-5,
        atol=0,
    )

    # Numerical estimate of d(sum(combine_weights)) / d(gate_prob).
    epsilon = 0.0005
    gate_prob_numpy = gate_prob.detach().astype("float32").numpy()
    num_grad = paddle.zeros_like(gate_prob)
    flattened = num_grad.reshape([-1])

    for i in range(gate_prob.numel()):
        prob_pos = gate_prob_numpy.copy()
        prob_neg = gate_prob_numpy.copy()
        prob_pos.flat[i] += epsilon
        prob_neg.flat[i] -= epsilon

        _, output_pos, _, _, _ = layer(x, paddle.to_tensor(prob_pos), k, capacity)
        _, output_neg, _, _, _ = layer(x, paddle.to_tensor(prob_neg), k, capacity)

        grad_value = paddle.to_tensor(
            (output_pos.numpy() - output_neg.numpy()).sum() / (2 * epsilon)
        )
        flattened[i] = grad_value

    flattened = flattened.reshape(gate_prob.shape)

    print(f"gate_prob gradient: {gate_prob.grad}")
    print(f"numerical gradient: {flattened}")
    np.testing.assert_allclose(
        gate_prob.grad.astype("float32").numpy(),
        flattened.astype("float32").numpy(),
        rtol=1e-4,
        atol=0,
    )


class TestFused(unittest.TestCase):
    def test_moe_backward(self):
        check_backward_correctness(TestLayer)

    def test_moe_permute_backward(self):
        check_backward_correctness(TestLayerPermute)


if __name__ == "__main__":
    unittest.main()

# ---- patch boundary preserved from the original diff ----
# diff --git a/backends/metax_gpu/tests/unit_test/test_layer_norm.py b/backends/metax_gpu/tests/unit_test/test_layer_norm.py
# new file mode 100644
# index 00000000000..dbeaee31f6c
# --- /dev/null
# +++ b/backends/metax_gpu/tests/unit_test/test_layer_norm.py
# @@ -0,0 +1,358 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from operator import mul +import paddle.base.core as core +import paddle.nn.functional as F +import paddle.base as base +from functools import reduce +from op_test import _set_use_system_allocator +from paddle.static.amp.fp16_utils import ( + _keep_layer_norm_scale_bias_to_fp32, +) +from paddle.pir_utils import OldIrGuard + +paddle.enable_static() + +np.random.random(123) + +_set_use_system_allocator(True) + + +def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + x.shape = [N, D] + + mean = np.mean(x, axis=1) + var = np.var(x, axis=1) + epsilon + output = np.divide((x - mean.reshape([N, 1])), (np.sqrt(var)).reshape([N, 1])) + if scale is not None: + output = scale.reshape([1, D]) * output + if beta is not None: + output = output + beta.reshape([1, D]) + + x.shape, output.shape = x_shape, x_shape + return output, mean, var + + +def _reference_layer_norm_grad(x, grad_y, scale, bias, mean, var, begin_norm_axis=1): + x_shape = x.shape + N = reduce(mul, x_shape[0:begin_norm_axis], 1) + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + + if scale is not None: + scale_shape = scale.shape + scale.shape = [1, D] + x.shape, grad_y.shape = [N, D], [N, D] + var.shape, mean.shape = [N, 1], [N, 1] + + # d_bias + if bias is not None: + d_bias = np.sum(grad_y, axis=0).reshape([1, D]) + else: + d_bias = None + # d_scale + if scale is not None: + d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y, axis=0).reshape( + [1, D] + ) + else: + d_scale = None + # dx + if scale is not None: + dx_end = scale * np.sqrt(1.0 / var) * grad_y + d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape( + [N, 1] + ) # the second part equals to zero. 
+ d_mean = 1.0 / D * d_mean_0 + d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape( + [N, 1] + ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + else: + dx_end = 1.0 * np.sqrt(1.0 / var) * grad_y + d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * 1.0, axis=1).reshape( + [N, 1] + ) # the second part equals to zero. + d_mean = 1.0 / D * d_mean_0 + d_std = np.sum(-(1.0 / var) * (x - mean) * grad_y * 1.0, axis=1).reshape( + [N, 1] + ) * (1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean)) + + grad_x = dx_end + d_mean + d_std + + grad_x.shape, x.shape, grad_y.shape = x_shape, x_shape, x_shape + var.shape, mean.shape = [N], [N] + + if scale is not None: + scale.shape = scale_shape + return grad_x, d_scale, d_bias + + +class TestLayerNormOp(unittest.TestCase): + def setUp(self): + self.init_dtype() + self.place = paddle.CustomPlace("metax_gpu", 0) + self.__class__.use_custom_device = True + + def init_dtype(self): + self.dtype = np.float32 + + def __assert_close(self, tensor, np_array, msg, atol=1e-4): + np.testing.assert_allclose( + np.array(tensor), np_array, rtol=1e-4, atol=atol, err_msg=msg + ) + + def check_forward_backward( + self, + shape, + begin_norm_axis, + has_scale=True, + has_bias=True, + y_grad_scale=1.0, + use_mkldnn=False, + ): + def test_with_place(place, shape, begin_norm_axis, use_mkldnn=use_mkldnn): + # attr + epsilon = 0.00001 + x_shape = shape + D = reduce(mul, x_shape[begin_norm_axis : len(x_shape)], 1) + scale_shape = [D] + + np.random.seed(123) + x = np.random.random_sample(x_shape).astype(self.dtype) + scale = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_scale + else None + ) + bias = ( + np.random.random_sample(scale_shape).astype(np.float32) + if has_bias + else None + ) + y_grad = (np.random.random_sample(x_shape) * y_grad_scale).astype( + self.dtype + ) + + # reference forward & backward + y, mean, variance = _reference_layer_norm_naive( + x, scale, bias, epsilon, 
begin_norm_axis + ) + x_grad, scale_grad, bias_grad = _reference_layer_norm_grad( + x, y_grad, scale, bias, mean, variance, begin_norm_axis + ) + mean.shape = x_shape[0:begin_norm_axis] + variance.shape = x_shape[0:begin_norm_axis] + + var_dict = locals() + var_dict["y@GRAD"] = y_grad + var_names = ["x", "mean", "variance", "y", "y@GRAD"] + if has_scale: + var_names += ["scale"] + if has_bias: + var_names += ["bias"] + ground_truth = {name: var_dict[name] for name in var_names} + + with OldIrGuard(): + program = base.Program() + old_program_guard = base.program_guard + with old_program_guard(program): + block = program.global_block() + for name in ground_truth: + block.create_var( + name=name, dtype=self.dtype, shape=ground_truth[name].shape + ) + inputs = {"X": block.var("x")} + fetch_list = [ + "y", + "mean", + "variance", + "x@GRAD", + ] + if has_scale: + inputs["Scale"] = block.var("scale") + fetch_list += ["scale@GRAD"] + if has_bias: + inputs["Bias"] = block.var("bias") + fetch_list += ["bias@GRAD"] + layer_norm_op = block.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": block.var("y"), + "Mean": block.var("mean"), # share the same memory + "Variance": block.var("variance"), # share the same memory + }, + attrs={ + "epsilon": epsilon, + "begin_norm_axis": begin_norm_axis, + "use_mkldnn": use_mkldnn, + }, + ) + # generate backward op_desc + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( + layer_norm_op.desc, set(), [] + ) + grad_op_desc = grad_op_desc_list[0] + new_op_desc = block.desc.append_op() + new_op_desc.copy_from(grad_op_desc) + for var_name in grad_op_desc.output_arg_names(): + block.desc.var(var_name.encode("ascii")) + grad_op_desc.infer_var_type(block.desc) + grad_op_desc.infer_shape(block.desc) + for arg in grad_op_desc.output_arg_names(): + grad_var = block.desc.find_var(arg.encode("ascii")) + grad_var.set_dtype(core.VarDesc.VarType.FP32) + + program._sync_with_cpp() + exe = base.Executor(place) + with OldIrGuard(): 
+ out = exe.run( + program, + feed={ + name: var_dict[name] + for name in ["x", "scale", "bias", "y@GRAD"] + }, + fetch_list=fetch_list, + ) + + self.__assert_close(y, out[0], "y") + self.__assert_close(mean, out[1], "mean") + self.__assert_close(variance, out[2], "variance", 1e-3) + self.__assert_close(x_grad, out[3], "x_grad") + if has_scale: + self.__assert_close( + scale_grad.reshape(-1), + out[fetch_list.index("scale@GRAD")], + "scale_grad", + 1e-3, + ) + if has_bias: + self.__assert_close( + bias_grad.reshape(-1), + out[fetch_list.index("bias@GRAD")], + "bias_grad", + ) + + test_with_place(self.place, shape, begin_norm_axis) + + def test_check_forward_backward_with_scale_and_bias(self): + self.check_forward_backward(shape=[1, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=True + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=True, has_bias=False + ) + self.check_forward_backward( + shape=[2, 3, 4, 5], begin_norm_axis=1, has_scale=False, has_bias=False + ) + self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3) + self.check_forward_backward( + shape=[92, 513, 129], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward(shape=[3, 34, 1134], begin_norm_axis=2) + self.check_forward_backward( + shape=[92, 513, 1134], begin_norm_axis=2, y_grad_scale=0.1 + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=True, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=True, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[92, 513, 1134], + begin_norm_axis=2, + has_scale=False, + has_bias=False, + y_grad_scale=0.1, + ) + self.check_forward_backward( + shape=[512, 1024], begin_norm_axis=1, has_scale=True, 
has_bias=True + ) + + +class TestFP16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + weight_np = weight_np.astype(dtype) + bias_np = bias_np.astype(dtype) + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + y_np = y.numpy().astype("float32") + x_g_np = x_g.numpy().astype("float32") + w_g_np = w_g.numpy().astype("float16") + b_g_np = b_g.numpy().astype("float32") + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + def test_main(self): + paddle.set_device("metax_gpu") + x_np = np.random.random([10, 20]).astype("float16") + weight_np = np.random.random([20]).astype("float16") + bias_np = np.random.random([20]).astype("float16") + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, "float16" + ) + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, "float32" + ) + + def assert_equal(x, y): + np.testing.assert_allclose(x, y) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + +class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): + def test_main(self): + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(False) + self.assertFalse(_keep_layer_norm_scale_bias_to_fp32()) + _keep_layer_norm_scale_bias_to_fp32(True) + self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py new file mode 100644 index 00000000000..7545e16d14d --- /dev/null +++ 
b/backends/metax_gpu/tests/unit_test/test_matmul_op__metax.py @@ -0,0 +1,395 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +from tests.op_test import OpTest +import paddle + +paddle.enable_static() +SEED = 2022 + + +def reference_matmul(X, Y, transpose_X=False, transpose_Y=False, scale=1.0): + """Reference forward implementation using np.matmul.""" + # np.matmul does not support the transpose flags, so we manually + # transpose X and Y appropriately. 
+ if transpose_X: + if X.ndim == 1: + X = X.reshape((X.size,)) + elif X.ndim == 2: + X = X.T + else: + dim = [i for i in range(len(X.shape))] + dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] + X = np.transpose(X, tuple(dim)) + if transpose_Y: + if Y.ndim == 1: + Y = Y.reshape((Y.size,)) + else: + dim = [i for i in range(len(Y.shape))] + dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] + Y = np.transpose(Y, tuple(dim)) + + Out = np.matmul(X, Y) + if abs(scale - 1.0) > 1e-09: + Out = Out * scale + return Out + + +class TestBmmOp(OpTest): + """ + case 0 + """ + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (10, 2, 5) + self.y_shape = (10, 5, 8) + + def init_kernel_type(self): + self.dtype = "float32" + + def setUp(self): + self.set_metax_gpu() + self.init_kernel_type() + self.config() + self.op_type = "bmm" + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y + result = reference_matmul(x, y) + result = result.astype(self.dtype) + self.inputs = { + "X": x, + "Y": y, + } + self.outputs = {"Out": result} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp1(TestBmmOp): + """ + case 1 + """ + + def config(self): + self.x_shape = (40, 10, 10) + self.y_shape = (40, 10, 10) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestBmmOp2(TestBmmOp): + """ + case 2 + """ + + def config(self): + self.x_shape = (4, 10, 80) + self.y_shape = (4, 80, 1) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, + ["X", "Y"], + "Out", 
+ max_relative_error=1e-2, + ) + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestMatMulOp(OpTest): + """ + basic case + """ + + def setUp(self): + self.set_metax_gpu() + self.op_type = "matmul_v2" + self.init_dtype() + self.init_alpha() + self.config() + + X = np.random.random(self.x_shape).astype(self.dtype) + Y = np.random.random(self.y_shape).astype(self.dtype) + # -0.1 ~ 0.1 + X = -0.1 + 0.2 * X + Y = -0.1 + 0.2 * Y + Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y, self.alpha) + Out = Out.astype(self.dtype) + self.inputs = {"X": X, "Y": Y} + self.attrs = { + "trans_x": self.transpose_X, + "trans_y": self.transpose_Y, + "alpha": self.alpha, + } + self.outputs = {"Out": Out} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def config(self): + self.x_shape = (100,) + self.y_shape = (100,) + self.transpose_X = False + self.transpose_Y = False + + def init_alpha(self): + self.alpha = 1.0 + + def init_dtype(self): + self.dtype = "float32" + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ["X", "Y"], "Out") + + +class TestMatMulOp1(TestMatMulOp): + """ + case x_ndim == 1, y_ndim != 1 + """ + + def config(self): + self.x_shape = (100,) + self.y_shape = (1, 3, 2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp2(TestMatMulOp): + """ + case x_ndim != 1, y_ndim == 1 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (100,) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp3(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp4(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, 
N] + """ + + def config(self): + self.x_shape = (2, 100) + self.y_shape = (2, 100) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp5(TestMatMulOp): + """ + case [M, K] x [K, N] = [M, N] + """ + + def config(self): + self.x_shape = (100, 2) + self.y_shape = (100, 2) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp6(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 2, 25) + self.y_shape = (25, 4) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp7(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 4, 25) + self.y_shape = (4, 25) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp8(TestMatMulOp): + """ + case [B, M, K] x [K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (1, 25, 4) + self.y_shape = (25, 4) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp9(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 10, 5) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp10(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 10, 5) + self.y_shape = (2, 10, 5) + self.transpose_X = True + self.transpose_Y = False + + +class TestMatMulOp11(TestMatMulOp): + """ + case [B, M, K] x [B, K, N] = [B, M, N] + """ + + def config(self): + self.x_shape = (2, 5, 10) + self.y_shape = (2, 5, 10) + self.transpose_X = False + self.transpose_Y = True + + +class TestMatMulOp12(TestMatMulOp): + """ + case to check the gradient for special case + """ + + def config(self): + self.x_shape = 100 + self.y_shape = (1, 2, 2, 100, 2) + self.transpose_X = False + self.transpose_Y = False + + +class TestMatMulOp13(TestMatMulOp): + """ + case to check the gradient for special 
case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = 100 + self.transpose_X = False + self.transpose_Y = False + + +# TODO(metax_gpu): alpha will be supported in next version +# --------------------test matmul alpha-------------------- +# def create_test_alpha_class(parent): +# class TestMatMulOpAlphaCase(parent): +# def init_alpha(self): +# self.alpha = 0.125 + +# cls_name = "{0}_{1}".format(parent.__name__, "Alpha") +# TestMatMulOpAlphaCase.__name__ = cls_name +# globals()[cls_name] = TestMatMulOpAlphaCase + +# create_test_alpha_class(TestMatMulOp) +# create_test_alpha_class(TestMatMulOp1) +# create_test_alpha_class(TestMatMulOp2) +# create_test_alpha_class(TestMatMulOp3) +# create_test_alpha_class(TestMatMulOp4) +# create_test_alpha_class(TestMatMulOp5) +# create_test_alpha_class(TestMatMulOp6) +# create_test_alpha_class(TestMatMulOp9) +# create_test_alpha_class(TestMatMulOp10) +# create_test_alpha_class(TestMatMulOp11) +# create_test_alpha_class(TestMatMulOp12) +# create_test_alpha_class(TestMatMulOp13) + + +# --------------------test matmul fp16-------------------- +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ["X", "Y"], "Out", max_relative_error=max_relative_error + ) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulOp) +create_test_fp16_class(TestMatMulOp1) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp9) 
+create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py new file mode 100644 index 00000000000..c9bccd2abb3 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_nonzero_api_metax.py @@ -0,0 +1,220 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np +from op_test import OpTest, convert_float_to_uint16 + +import paddle +from paddle import base +from paddle.base import Program, program_guard + + +def call_nonzero(x): + input = paddle.to_tensor(x) + return paddle.nonzero(x=input) + + +class TestNonZeroAPI(unittest.TestCase): + def test_nonzero_api_as_tuple(self): + paddle.enable_static() + data = np.array([[1, 0], [0, 1]], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 2) + z = paddle.concat(list(y), axis=0) + exe = base.Executor(base.CPUPlace()) + + (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False) + expect_out = np.array([0, 1, 0, 1]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.array([1, 1, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 1) + z = paddle.concat(list(y), axis=0) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[z], return_numpy=False) + expect_out = np.array([0, 1]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.zeros([10, 3, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[10, 3, 0], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x, as_tuple=True) + self.assertEqual(type(y), tuple) + self.assertEqual(len(y), 3) + expect_out = np.zeros([0]) + for item in y: + 
np.testing.assert_array_equal(expect_out, item) + + def test_nonzero_api(self): + paddle.enable_static() + data = np.array([[1, 0], [0, 1]], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1, 2], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False) + expect_out = np.array([[0, 0], [1, 1]]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + data = np.array([1, 1, 0], dtype="float32") + with program_guard(Program(), Program()): + x = paddle.static.data(name="x", shape=[-1], dtype="float32") + if not paddle.framework.use_pir_api(): + x.desc.set_need_check_feed(False) + y = paddle.nonzero(x) + exe = base.Executor(base.CPUPlace()) + (res,) = exe.run(feed={"x": data}, fetch_list=[y], return_numpy=False) + expect_out = np.array([[0], [1]]) + np.testing.assert_allclose(expect_out, np.array(res), rtol=1e-05) + + def test_dygraph_api(self): + data_x = np.array([[True, False], [False, True]]) + with base.dygraph.guard(): + x = paddle.to_tensor(data_x) + z = paddle.nonzero(x) + np_z = z.numpy() + expect_out = np.array([[0, 0], [1, 1]]) + np.testing.assert_allclose(expect_out, np_z, rtol=1e-05)  # result was previously computed but never checked + + +# Base case +class TestNonzeroOp(OpTest): + def setUp(self): + """Test where_index op with random value""" + np.random.seed(2023) + self.op_type = "where_index" + self.python_api = call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [8, 8] + + def init_dtype(self): + self.dtype = np.float64 + + def create_inputs(self): + return {"Condition": np.random.randint(5, size=self.shape).astype(self.dtype)} + + def return_outputs(self): + return {"Out":
np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestNonzeroComplex64Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex64 + + +class TestNonzeroComplex128Op(TestNonzeroOp): + def init_shape(self): + self.shape = [1, 2, 3] + + def init_dtype(self): + self.dtype = np.complex128 + + +class TestNonzeroFP32Op(TestNonzeroOp): + def init_shape(self): + self.shape = [2, 10, 2] + + def init_dtype(self): + self.dtype = np.float32 + + +class TestNonzeroFP16Op(TestNonzeroOp): + def init_shape(self): + self.shape = [3, 4, 7] + + def init_dtype(self): + self.dtype = np.float16 + + +class TestNonzeroBF16(OpTest): + def setUp(self): + """Test where_index op with bfloat16 dtype""" + np.random.seed(2023) + self.op_type = "where_index" + self.python_api = call_nonzero + self.init_shape() + self.init_dtype() + + self.inputs = self.create_inputs() + self.outputs = self.return_outputs() + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=False) + + def init_shape(self): + self.shape = [12, 9] + + def init_dtype(self): + self.dtype = np.uint16 + + def create_inputs(self): + return { + "Condition": convert_float_to_uint16( + np.random.randint(5, size=self.shape).astype(np.float32) + ) + } + + def return_outputs(self): + return {"Out": np.transpose(np.nonzero(self.inputs["Condition"]))} + + +class TestZeroSizeOp(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + +class TestZeroSizeOpCase2(TestNonzeroOp): + def init_shape(self): + self.shape = [0, 10] + + def init_dtype(self): + self.dtype = np.float64 + + def test_check_output(self): + self.check_output(check_pir=True, check_symbol_infer=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py new file mode 100644 index 
00000000000..c1bc46517b6 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_p_norm_op_metax.py @@ -0,0 +1,215 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from tests.op_test import OpTest + +paddle.enable_static() + + +def p_norm(x, axis, porder, keepdims=False, reduce_all=False): + r = [] + if axis is None or reduce_all: + x = x.flatten() + if porder == np.inf: + r = np.amax(np.abs(x), keepdims=keepdims) + elif porder == -np.inf: + r = np.amin(np.abs(x), keepdims=keepdims) + else: + r = np.linalg.norm(x, ord=porder, keepdims=keepdims) + elif isinstance(axis, (list, tuple)) and len(axis) == 2:  # `list or tuple` evaluates to `list`, so tuple axes never reached this branch + if porder == np.inf: + axis = tuple(axis) + r = np.amax(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == -np.inf: + axis = tuple(axis) + r = np.amin(np.abs(x), axis=axis, keepdims=keepdims) + elif porder == 0: + axis = tuple(axis) + r = x.astype(bool) + r = np.sum(r, axis, keepdims=keepdims) + elif porder == 1: + axis = tuple(axis) + r = np.sum(np.abs(x), axis, keepdims=keepdims) + else: + axis = tuple(axis) + xp = np.power(np.abs(x), porder) + s = np.sum(xp, axis=axis, keepdims=keepdims) + r = np.power(s, 1.0 / porder) + else: + if isinstance(axis, list): + axis = tuple(axis) + r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims) + r = r.astype(x.dtype) + + return r + + +class TestPnormOp(OpTest): + def
set_metax_gpu(self): + self.__class__.use_custom_device = True + + def setUp(self): + self.set_metax_gpu() + self.op_type = "p_norm" + self.init_test_case() + x = (np.random.random(self.shape) + 0.5).astype(self.dtype) + norm = p_norm(x, self.axis, self.porder, self.keepdim) + self.inputs = {"X": x} + self.attrs = { + "epsilon": self.epsilon, + "axis": self.axis, + "keepdim": self.keepdim, + "porder": float(self.porder), + } + self.outputs = {"Out": norm} + self.gradient = self.calc_gradient() + + def test_check_output(self): + if self.dtype == "float16": + self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0), atol=5e-3) + else: + self.check_output_with_place(paddle.CustomPlace("metax_gpu", 0)) + + def test_check_grad(self): + self.check_grad_with_place( + paddle.CustomPlace("metax_gpu", 0), + ["X"], + "Out", + user_defined_grads=self.gradient, + ) + + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.init_dtype() + + def init_dtype(self): + self.dtype = "float32" + + def calc_gradient(self): + self.attrs = { + "epsilon": self.epsilon, + "axis": self.axis, + "keepdim": self.keepdim, + "porder": float(self.porder), + } + x = self.inputs["X"] + porder = self.attrs["porder"] + axis = self.attrs["axis"] + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm(x, axis=axis, porder=porder, keepdims=True) + grad = ( + np.power(norm, 1 - porder) + * np.power(np.abs(x), porder - 1) + * np.sign(x) + ) + + numel = 1 + for s in x.shape: + numel *= s + numel /= x.shape[axis] + return [grad.astype(x.dtype) * 1 / numel] + + +class TestPnormOp2(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 2.0 + 
self.keepdim = True + self.init_dtype() + + +# class TestPnormOp3(TestPnormOp): +# def init_test_case(self): +# self.shape = [3, 20, 3] +# self.axis = 2 +# self.epsilon = 1e-12 +# self.porder = np.inf +# self.keepdim = True +# self.init_dtype() + + +# class TestPnormOp4(TestPnormOp3): +# def init_test_case(self): +# self.shape = [3, 20, 3] +# self.axis = 2 +# self.epsilon = 1e-12 +# self.porder = -np.inf +# self.keepdim = True +# self.init_dtype() + + +class TestPnormOp5(TestPnormOp): + def init_test_case(self): + self.shape = [3, 20, 3] + self.axis = 2 + self.epsilon = 1e-12 + self.porder = 0 + self.keepdim = True + self.init_dtype() + + +# class TestPnormOp6(TestPnormOp): +# def init_test_case(self): +# self.shape = [2, 3, 4, 5] +# self.axis = 1 +# self.epsilon = 1e-12 +# self.porder = 0.5 +# self.keepdim = False +# self.init_dtype() + + +class TestPnormOpfp16(TestPnormOp): + def init_dtype(self): + self.dtype = "float16" + + +class TestPnormOp2fp16(TestPnormOp2): + def init_dtype(self): + self.dtype = "float16" + + +# class TestPnormOp3fp16(TestPnormOp3): +# def init_dtype(self): +# self.dtype = "float16" + + +# class TestPnormOp4fp16(TestPnormOp4): +# def init_dtype(self): +# self.dtype = "float16" + + +class TestPnormOp5fp16(TestPnormOp5): + def init_dtype(self): + self.dtype = "float16" + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py new file mode 100644 index 00000000000..c67e807397c --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_squeeze_op_metax.py @@ -0,0 +1,125 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest + +# import sys + +# sys.path.append("..") + +import numpy as np + +import paddle +from tests.op_test import OpTest + +paddle.enable_static() + + +# Correct: General. +class TestSqueezeOp(OpTest): + def setUp(self): + self.op_type = "squeeze2" + self.init_test_case() + self.set_metax_gpu() + self.inputs = {"X": np.random.random(self.ori_shape).astype("float64")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + } + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (1, 3, 1, 40) + self.axes = (0, 2) + self.new_shape = (3, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# class TestSqueezeBF16Op(OpTest): +# def setUp(self): +# self.op_type = "squeeze2" +# self.dtype = np.uint16 +# self.init_test_case() +# self.set_metax_gpu() +# x = np.random.random(self.ori_shape).astype("float32") +# out = x.reshape(self.new_shape) +# self.inputs = {"X": convert_float_to_uint16(x)} +# self.init_attrs() +# self.outputs = {"Out": convert_float_to_uint16(out)} + +# def set_metax_gpu(self): +# self.__class__.use_custom_device = True +# self.place = paddle.CustomPlace("metax_gpu", 0) + +# def test_check_output(self): +# self.check_output() + +# def test_check_grad(self): +# 
self.check_grad(["X"], "Out") + +# def init_test_case(self): +# self.ori_shape = (1, 3, 1, 40) +# self.axes = (0, 2) +# self.new_shape = (3, 40) + +# def init_attrs(self): +# self.attrs = {"axes": self.axes} + + +# Correct: There is a negative axis. +class TestSqueezeOp1(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (1, 3, 1, 40) + self.axes = (0, -2) + self.new_shape = (3, 40) + + +# Correct: No axes input. +class TestSqueezeOp2(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (1, 20, 1, 5) + self.axes = () + self.new_shape = (20, 5) + + +# Correct: Only some of the size-1 axes are squeezed. +class TestSqueezeOp3(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (6, 1, 5, 1, 4, 1) + self.axes = (1, -1) + self.new_shape = (6, 5, 1, 4) + + +# Correct: An axis whose dimension is not of size 1 remains unchanged. +class TestSqueezeOp4(TestSqueezeOp): + def init_test_case(self): + self.ori_shape = (6, 1, 5, 1, 4, 1) + self.axes = (1, 2) + self.new_shape = (6, 5, 1, 4, 1) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py new file mode 100644 index 00000000000..40e46e70a21 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_swiglu_metax.py @@ -0,0 +1,295 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import unittest + +import numpy as np +from op_test import OpTest + +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import _C_ops +from paddle.base import core +from paddle.distributed.auto_parallel.static.dist_attribute import ( + DistTensorSpec, + TensorDistAttr, +) +from paddle.incubate.nn.functional import swiglu as fused_swiglu_impl + + +def swiglu(x, y, out_grad): + if isinstance(x, np.ndarray): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + out_grad = paddle.to_tensor(out_grad) + + origin_x = x.detach().clone() + origin_x.stop_gradient = False + x = origin_x + + origin_y = y.detach().clone() + origin_y.stop_gradient = False + y = origin_y + + dtype = x.dtype + need_convert = False + assert dtype == y.dtype + output_dtype = dtype + if paddle.is_compiled_with_cuda(): + if dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + x = x.astype(output_dtype) + y = y.astype(output_dtype) + need_convert = True + + out = F.silu(x) * y + if need_convert: + out = out.astype(dtype) + out.backward(out_grad) + ret = [ + out.astype(output_dtype), + origin_x.grad.astype(output_dtype), + origin_y.grad.astype(output_dtype), + ] + return ret + + +def fused_swiglu(x, y, out_grad): + x = x.detach().clone() + x.stop_gradient = False + if y is not None: + y = y.detach().clone() + y.stop_gradient = False + out = fused_swiglu_impl(x, y) + out.backward(out_grad) + + output_dtype = x.dtype + if paddle.is_compiled_with_cuda(): + if x.dtype in [paddle.float16, paddle.bfloat16]: + output_dtype = paddle.float32 + ret = [ + out.astype(output_dtype), + ] + if y is not None: + x_grad, y_grad = x.grad, y.grad + else: + x_grad, y_grad = paddle.split(x.grad, 2, axis=-1) + + ret.append(x_grad.astype(output_dtype)) + ret.append(y_grad.astype(output_dtype)) + return ret + + +tol_map = { + paddle.float64: [1e-8, 1e-8], + paddle.float32: [1e-6, 1e-6], + paddle.float16: [1e-3, 1e-3], + paddle.bfloat16: [1e-3, 1e-3], +} + 
+ +class TestSwiGLUDygraph(unittest.TestCase): + def check_dygraph_impl(self, device, shape, dtype): + x = paddle.randn(shape, dtype=dtype) + y = paddle.randn(shape, dtype=dtype) + out_grad = paddle.randn(shape, dtype=dtype) + + ret1 = swiglu(x, y, out_grad) + ret2 = fused_swiglu(x, y, out_grad) + ret3 = fused_swiglu(paddle.concat([x, y], axis=-1), None, out_grad) + + atol, rtol = tol_map[dtype] + err_msg = f"Failed when device = {device}, dtype = {dtype}, shape = {shape}" + for t1, t2, t3 in zip(ret1, ret2, ret3): + t1, t2, t3 = t1.numpy(), t2.numpy(), t3.numpy() + np.testing.assert_allclose(t1, t2, atol=atol, rtol=rtol, err_msg=err_msg) + np.testing.assert_equal(t2, t3, err_msg=err_msg) + + def check_dygraph(self, shape): + metas = [("cpu", paddle.float32), ("cpu", paddle.float64)] + if paddle.is_compiled_with_cuda(): + metas.append(("gpu", paddle.float32)) + metas.append(("gpu", paddle.float64)) + metas.append(("gpu", paddle.float16)) + prop = paddle.device.cuda.get_device_properties() + if prop.major >= 8: + metas.append(("gpu", paddle.bfloat16)) + + for device, dtype in metas: + origin_device = paddle.get_device() + paddle.set_device(device) + for with_split in [True]: + self.check_dygraph_impl(device, shape, dtype) + paddle.set_device(origin_device) + + def check_static_graph(self, shape, dtype="float32"): + x = paddle.static.data(name="x", shape=shape, dtype=dtype) + y = paddle.static.data(name="y", shape=shape, dtype=dtype) + concated_x = paddle.static.data( + name="concated_x", + shape=[*shape[:-1], shape[-1] * 2], + dtype=dtype, + ) + out1 = fused_swiglu_impl(x, y) + out2 = fused_swiglu_impl(concated_x) + + concated_x_np = np.random.random(concated_x.shape).astype(dtype) + x_np, y_np = np.split(concated_x_np, 2, axis=-1) + + exe = paddle.static.Executor() + t1, t2 = exe.run( + feed={"x": x_np, "y": y_np, "concated_x": concated_x_np}, + fetch_list=[out1, out2], + ) + np.testing.assert_equal(t1, t2) + + def check_main(self, shape): + 
self.check_dygraph(shape) + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + self.check_static_graph(shape) + paddle.disable_static() + + def test_main(self): + self.check_main([8, 100]) + self.check_main([4, 101]) + + +class TestSwigluOp(OpTest): + def config(self): + self.x_shape = (8, 128) + self.check_auto_parallel = True + + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + y = np.random.uniform(-1, 1, self.x_shape).astype("float64") + out_grad = np.random.uniform(-1, 1, self.x_shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + def test_check_output(self): + self.check_output(check_prim_pir=True) + + def test_check_grad(self): + self.check_grad( + ["x", "y"], + "out", + check_auto_parallel=self.check_auto_parallel, + check_dygraph=1, + check_prim_pir=True, + ) + + +class TestSwigluOp2(TestSwigluOp): + def setUp(self): + self.config() + self.op_type = "swiglu" + self.prim_op_type = "comp" + self.python_api = fused_swiglu_impl + self.public_python_api = fused_swiglu_impl + x = np.random.uniform(-1, 1, self.x_shape).astype("float64") + tmp_inputs = np.split(x, 2, axis=-1) + x = tmp_inputs[0] + y = tmp_inputs[1] + out_grad = np.random.uniform(-1, 1, x.shape).astype("float64") + res = swiglu(x, y, out_grad) + self.inputs = {"x": x, "y": y} + self.outputs = {"out": res[0].numpy()} + self.placements = { + "x": [dist.Shard(1)], + "y": [dist.Shard(1)], + "out": [dist.Shard(1)], + } + + +@unittest.skipIf( + not paddle.base.core.is_compiled_with_dist(), + "The spmd rule is should be tested with distributed=ON", +) +class 
TestSwigluSpmd(unittest.TestCase): + def setUp(self): + self.kernel = "swiglu" + self.rule = paddle.base.core.get_phi_spmd_rule(self.kernel) + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [-1, 0] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.y_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + self.out_dist_tensor_spec = DistTensorSpec(self.x_dist_tensor_spec) + + def test_input_x_y(self): + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, self.y_dist_tensor_spec + ) + inferred_input_dist_attrs = result_dist_attrs[0] + inferred_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(inferred_input_dist_attrs), 2) + self.assertEqual(len(inferred_output_dist_attrs), 1) + self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [-1, 0]) + + def test_input_x_unshard_last_dim(self): + x_shape = [64, 32] + process_mesh = dist.ProcessMesh(mesh=[0, 1, 2, 3]) + x_tensor_dist_attr = TensorDistAttr() + x_tensor_dist_attr.dims_mapping = [0, -1] + x_tensor_dist_attr.process_mesh = process_mesh + self.x_dist_tensor_spec = DistTensorSpec(x_shape, x_tensor_dist_attr) + + result_dist_attrs = self.rule.infer_forward( + self.x_dist_tensor_spec, DistTensorSpec() + ) + inferred_input_dist_attrs = result_dist_attrs[0] + inferred_output_dist_attrs = result_dist_attrs[1] + self.assertEqual(len(result_dist_attrs), 2) + self.assertEqual(len(inferred_input_dist_attrs), 2) + self.assertEqual(len(inferred_output_dist_attrs), 1) + self.assertEqual(inferred_output_dist_attrs[0].dims_mapping, [0, -1]) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), "mamtul 0 size only with in cuda") +class TestSwiglu0SizeDygraph(unittest.TestCase): + def test_swiglu(self): + x = paddle.ones([0, 128], dtype="float32") + y = 
paddle.ones([0, 128], dtype="float32") + x.stop_gradient = False + y.stop_gradient = False + out = fused_swiglu_impl(x, y) + + dz = paddle.ones([0, 128], dtype="float32") + + out = _C_ops.swiglu_grad(x, y, dz) + + self.assertEqual(out[0].shape, x.shape) + self.assertEqual(out[1].shape, y.shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py new file mode 100644 index 00000000000..4369972255d --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_top_p_sampling.py @@ -0,0 +1,162 @@ +# 2024 - Modified by MetaX Integrated Circuits (Shanghai) Co., Ltd. All Rights Reserved. +# # Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import numpy as np + +import paddle + + +def TopPProcess(probs, top_p): + sorted_probs = paddle.sort(probs, descending=True) + sorted_indices = paddle.argsort(probs, descending=True) + cumulative_probs = paddle.cumsum(sorted_probs, axis=-1) + + # Remove tokens with cumulative probs above the top_p, but keep at + # least the highest-probability token + sorted_indices_to_remove = cumulative_probs > top_p + + # Keep the first token + sorted_indices_to_remove = paddle.cast(sorted_indices_to_remove, dtype="int64") + + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, + (slice(None), slice(1, None)), + sorted_indices_to_remove[:, :-1].clone(), + ) + sorted_indices_to_remove = paddle.static.setitem( + sorted_indices_to_remove, (slice(None), 0), 0 + ) + + # Scatter sorted tensors to original indexing + sorted_indices = ( + sorted_indices + paddle.arange(probs.shape[0]).unsqueeze(-1) * probs.shape[-1] + ) + condition = paddle.scatter( + sorted_indices_to_remove.flatten(), + sorted_indices.flatten(), + sorted_indices_to_remove.flatten(), + ) + condition = paddle.cast(condition, "bool").reshape(probs.shape) + probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs) + next_tokens = paddle.multinomial(probs) + next_scores = paddle.index_sample(probs, next_tokens) + return next_scores, next_tokens + + +class TestTopPAPI(unittest.TestCase): + def setUp(self): + self.topp = 0.0 + self.seed = 6688 + self.batch_size = 3 + self.vocab_size = 10000 + self.dtype = "float32" + self.input_data = np.random.rand(self.batch_size, self.vocab_size) + + def run_dygraph(self, place): + with paddle.base.dygraph.guard(place): + input_tensor = paddle.to_tensor(self.input_data, self.dtype) + topp_tensor = paddle.to_tensor( + [ + self.topp, + ] + * self.batch_size, + self.dtype, + ).reshape((-1, 1)) + + # test case 1: fixed seed with the default sampling options + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res =
TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + # test case 2: seed=-1 with k=5, mode="non-truncated" and return_top=True + paddle_result = paddle.tensor.top_p_sampling( + input_tensor, + topp_tensor, + seed=-1, + k=5, + mode="non-truncated", + return_top=True, + ) + ref_res = TopPProcess(input_tensor, self.topp) + + np.testing.assert_allclose( + paddle_result[0].numpy(), ref_res[0].numpy(), rtol=1e-05 + ) + np.testing.assert_allclose( + paddle_result[1].numpy().flatten(), + ref_res[1].numpy().flatten(), + rtol=0, + ) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard( + paddle.static.Program(), paddle.static.Program() + ): + input_tensor = paddle.static.data( + name="x", shape=[6, 1030], dtype=self.dtype + ) + topp_tensor = paddle.static.data( + name="topp", shape=[6, 1], dtype=self.dtype + ) + result = paddle.tensor.top_p_sampling( + input_tensor, topp_tensor, seed=self.seed + ) + ref_res = TopPProcess(input_tensor, self.topp) + exe = paddle.static.Executor(place) + input_data = np.random.rand(6, 1030).astype(self.dtype) + paddle_result = exe.run( + feed={ + "x": input_data, + "topp": np.array( + [ + self.topp, + ] + * 6 + ).astype(self.dtype), + }, + fetch_list=[ + result[0], + result[1], + ref_res[0], + ref_res[1], + ], + ) + np.testing.assert_allclose(paddle_result[0], paddle_result[2], rtol=1e-05) + np.testing.assert_allclose(paddle_result[1], paddle_result[3], rtol=1e-05) + + def test_dygraph(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_dygraph(place) + + def test_static(self): + place = paddle.CustomPlace("metax_gpu", 0) + self.run_static(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py new file
mode 100644 index 00000000000..ff22c2c9ac9 --- /dev/null +++ b/backends/metax_gpu/tests/unit_test/test_unsqueeze_op_metax.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest + +from tests.op_test import OpTest +import paddle + +paddle.enable_static() + + +# Correct: General. +class TestUnsqueezeOp(OpTest): + def setUp(self): + self.set_metax_gpu() + self.op_type = "unsqueeze2" + self.dtype = "float32" + self.init_test_case() + self.inputs = {"X": np.random.random(self.ori_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def set_metax_gpu(self): + self.__class__.use_custom_device = True + self.place = paddle.CustomPlace("metax_gpu", 0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.ori_shape = (3, 40) + self.axes = (1, 2) + self.new_shape = (3, 1, 1, 40) + + def init_attrs(self): + self.attrs = {"axes": self.axes} + + +# Correct: Single input index. +class TestUnsqueezeOp1(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (-1,) + self.new_shape = (20, 5, 1) + + +# Correct: Mixed input axis. 
+class TestUnsqueezeOp2(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (20, 5) + self.axes = (0, -1) + self.new_shape = (1, 20, 5, 1) + + +# Correct: There is duplicated axis. +class TestUnsqueezeOp3(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (0, 3, 3) + self.new_shape = (1, 10, 2, 1, 1, 5) + + +# Correct: Reversed axes. +class TestUnsqueezeOp4(TestUnsqueezeOp): + def init_test_case(self): + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) + + +# test float16 +class TestUnsqueezeOp5(TestUnsqueezeOp): + def init_test_case(self): + self.dtype = "float16" + self.ori_shape = (10, 2, 5) + self.axes = (3, 1, 1) + self.new_shape = (10, 1, 1, 2, 5, 1) + + +if __name__ == "__main__": + unittest.main()