diff --git a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h index fb1368b069c..b517b719d49 100644 --- a/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h +++ b/backends/metax_gpu/kernels/impl/addmm_kernel_impl.h @@ -98,6 +98,7 @@ void AddmmKernel(const Context& dev_ctx, y_dims[0])); dev_ctx.template Alloc(out); + if (out->numel() == 0) return; auto blas = funcs::GetBlas(dev_ctx); // calc broadcast dim diff --git a/backends/metax_gpu/patch/paddle.patch b/backends/metax_gpu/patch/paddle.patch index 4c06609338c..69d714ef6e0 100755 --- a/backends/metax_gpu/patch/paddle.patch +++ b/backends/metax_gpu/patch/paddle.patch @@ -438,6 +438,21 @@ index d69eb67d6f..1d8b6e9375 100644 #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/math_function.h" +diff --git a/paddle/phi/kernels/funcs/embedding_grad.h b/paddle/phi/kernels/funcs/embedding_grad.h +index 461e6e2474..48a64ae9ce 100644 +--- a/paddle/phi/kernels/funcs/embedding_grad.h ++++ b/paddle/phi/kernels/funcs/embedding_grad.h +@@ -143,8 +143,8 @@ void LaunchEmbeddingGradDeterministicKernel(const GPUContext& dev_ctx, + constexpr int kWarpSize = 64; + constexpr int kBlockDimY = 16; + #else +- constexpr int kWarpSize = 32; +- constexpr int kBlockDimY = 32; ++ constexpr int kWarpSize = 64; ++ constexpr int kBlockDimY = 16; + #endif + dim3 threads(kWarpSize, kBlockDimY); + dim3 grids(static_cast((D + kWarpSize - 1) / kWarpSize)); diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index cb35feee32..64f5bd24ac 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu @@ -501,6 +516,49 @@ index 15e1a4a3c3..e4780538d7 100644 #include "paddle/phi/kernels/funcs/im2col.h" namespace phi { +diff --git a/paddle/phi/kernels/funcs/math_cuda_utils.h b/paddle/phi/kernels/funcs/math_cuda_utils.h +index e5361b836e..5ad238df08 100644 +--- a/paddle/phi/kernels/funcs/math_cuda_utils.h ++++ b/paddle/phi/kernels/funcs/math_cuda_utils.h +@@ -175,12 +175,12 @@ struct KeyValuePair { + #define WARP_SIZE_WIDTH_MASK 0x3f + typedef u_int64_t warp_mask_t; + #else +-#define FINAL_MASK 0xffffffff +-#define HALF_WARP 16 +-#define WARP_SIZE 32 +-#define WARP_SIZE_WIDTH 5 +-#define WARP_SIZE_WIDTH_MASK 0x1f +-typedef unsigned warp_mask_t; ++#define FINAL_MASK 0xffffffffffffffffUL ++#define HALF_WARP 32 ++#define WARP_SIZE 64 ++#define WARP_SIZE_WIDTH 6 ++#define WARP_SIZE_WIDTH_MASK 0x3f ++typedef u_int64_t warp_mask_t; + #endif + + template +@@ -200,19 +200,13 @@ __inline__ __device__ T BlockReduceSum(T val, warp_mask_t mask) { + static __shared__ T shared[WARP_SIZE]; + int lane = threadIdx.x & WARP_SIZE_WIDTH_MASK; + int wid = threadIdx.x >> WARP_SIZE_WIDTH; +- + val = WarpReduceSum(val, mask); +- +- __syncthreads(); + if (lane == 0) shared[wid] = val; +- + __syncthreads(); +- + // align block_span to warpSize + int block_span = (blockDim.x + warpSize - 1) >> WARP_SIZE_WIDTH; + val = (lane < block_span) ? shared[lane] : static_cast(0.0f); + val = WarpReduceSum(val, mask); +- + return val; + } + diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu b/paddle/phi/kernels/funcs/matrix_inverse.cu index e101224970..a52eb6096f 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu @@ -534,7 +592,7 @@ index 558d363b39..05da04b517 100644 #include "paddle/phi/kernels/funcs/scatter.cu.h" diff --git a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu -index 8b0baf5f5f..260482f124 100644 +index 047f52bd91..a05b34d3ba 100644 --- a/paddle/phi/kernels/funcs/multihead_matmul_functor.cu +++ b/paddle/phi/kernels/funcs/multihead_matmul_functor.cu @@ -27,7 +27,7 @@ namespace cub = hipcub;