PaddlePaddle
diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake
Lines changed: 7 additions & 2 deletions
diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
Lines changed: 3 additions & 1 deletion
diff --git a/paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu b/paddle/fluid/distributed/collective/deep_ep/kernels/internode_ll.cu
Lines changed: 87 additions & 21 deletions
diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc
Lines changed: 2 additions & 2 deletions
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
Lines changed: 1 addition & 1 deletion
diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc
Lines changed: 3 additions & 1 deletion
diff --git a/paddle/phi/kernels/cpu/reduce_sum_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_kernel.cc
Lines changed: 35 additions & 2 deletions
diff --git a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc
Lines changed: 4 additions & 0 deletions
diff --git a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc
Lines changed: 4 additions & 0 deletions
diff --git a/paddle/phi/kernels/funcs/stack_and_unstack.h b/paddle/phi/kernels/funcs/stack_and_unstack.h
Lines changed: 7 additions & 4 deletions
@@ -26,11 +26,16 @@ set(CUB_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/cub)
 
 if(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6)
   # cuda_11.6/11.7/11.8's own cub is 1.15.0, which causes compile errors on Windows.
-  set(CUB_TAG 1.16.0)
+  set(CUB_TAG 2.1.0)
   execute_process(COMMAND git --git-dir=${CUB_SOURCE_DIR}/.git
                           --work-tree=${CUB_SOURCE_DIR} checkout ${CUB_TAG})
-  # cub 1.16.0 is not compatible with current thrust version
+  # cub 2.1.0 is not compatible with current thrust version
   add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK)
+  if(${CMAKE_CUDA_COMPILER_VERSION} EQUAL 11.8)
+    set(cub_patches "${PADDLE_SOURCE_DIR}/patches/cub")
+    message(STATUS "Add cub patches: ${cub_patches}")
+    include_directories(${cub_patches})
+  endif()
 else()
   set(CUB_TAG 1.8.0)
 endif()
 
@@ -482,7 +482,9 @@ if(WITH_ONNXRUNTIME)
 endif()
 
 if(WITH_GPU)
-  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0
+     OR (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.7
+         AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.9))
     include(external/cub) # download cub
     list(APPEND third_party_deps extern_cub)
   elseif(${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 12.0 AND WITH_SHARED_PHI)
 
@@ -279,13 +279,33 @@ __global__ __launch_bounds__(
             rank * num_max_dispatch_tokens_per_rank * num_bytes_per_msg +
             slot_idx * num_bytes_per_msg;
         if (dst_rank != rank) {
-          nvshmemi_ibgda_put_nbi_warp(dst_ptr,
-                                      src_ptr,
-                                      num_bytes_per_msg,
-                                      dst_rank,
-                                      dst_expert_local_idx,
-                                      lane_id,
-                                      slot_idx);
+          void* peer_base_addr = reinterpret_cast<void*>(
+              __ldg(reinterpret_cast<const uint64_t*>(
+                        nvshmemi_device_state_d.peer_heap_base_p2p) +
+                    dst_rank));
+          if (peer_base_addr) {
+            char* req_rptr_actual =
+                reinterpret_cast<char*>(peer_base_addr) +
+                (reinterpret_cast<char*>(dst_ptr) -
+                 reinterpret_cast<char*>(nvshmemi_device_state_d.heap_base));
+            const auto* src_int4_ptr = reinterpret_cast<const int4*>(src_ptr);
+            const auto* dst_int4_ptr = reinterpret_cast<int4*>(req_rptr_actual);
+            UNROLLED_WARP_COPY(8,
+                               lane_id,
+                               num_int4_per_msg,
+                               dst_int4_ptr,
+                               src_int4_ptr,
+                               ld_nc_global,
+                               st_na_global);
+          } else {
+            nvshmemi_ibgda_put_nbi_warp(dst_ptr,
+                                        src_ptr,
+                                        num_bytes_per_msg,
+                                        dst_rank,
+                                        dst_expert_local_idx,
+                                        lane_id,
+                                        slot_idx);
+          }
         } else {
           // NOTES: only 2 load iterations for 7K hidden with 8 unrolls
           const auto* src_int4_ptr = reinterpret_cast<const int4*>(src_ptr);
@@ -367,11 +387,24 @@ __global__ __launch_bounds__(
                              responsible_expert_idx) != FINISHED_SUM_TAG * 2) {
     }
     if (dst_rank != rank) {
-      nvshmemi_ibgda_amo_nonfetch_add(
-          rdma_recv_count + dst_expert_local_idx * num_ranks + rank,
-          -num_tokens_sent - 1,
-          dst_rank,
-          dst_expert_local_idx);
+      void* peer_base_addr = reinterpret_cast<void*>(
+          __ldg(reinterpret_cast<const uint64_t*>(
+                    nvshmemi_device_state_d.peer_heap_base_p2p) +
+                dst_rank));
+      if (peer_base_addr) {  // P2P enabled
+        int* rptr_actual = reinterpret_cast<int*>(
+            reinterpret_cast<char*>(peer_base_addr) +
+            (reinterpret_cast<char*>(rdma_recv_count +
+                                     dst_expert_local_idx * num_ranks + rank) -
+             reinterpret_cast<char*>(nvshmemi_device_state_d.heap_base)));
+        st_na_release(rptr_actual, -num_tokens_sent - 1);
+      } else {
+        nvshmemi_ibgda_amo_nonfetch_add(
+            rdma_recv_count + dst_expert_local_idx * num_ranks + rank,
+            -num_tokens_sent - 1,
+            dst_rank,
+            dst_expert_local_idx);
+      }
     } else {
       st_na_release(rdma_recv_count + dst_expert_local_idx * num_ranks + rank,
                     -num_tokens_sent - 1);
@@ -691,13 +724,32 @@ __global__ __launch_bounds__(
                              x_int4,
                              ld_nc_global,
                              st_na_global);
-        nvshmemi_ibgda_put_nbi_warp(dst_ptr,
-                                    buf_ptr,
-                                    hidden * sizeof(nv_bfloat16),
-                                    dst_rank,
-                                    local_expert_idx,
-                                    lane_id,
-                                    token_idx - offset);
+        void* peer_base_addr = reinterpret_cast<void*>(
+            __ldg(reinterpret_cast<const uint64_t*>(
+                      nvshmemi_device_state_d.peer_heap_base_p2p) +
+                  dst_rank));
+        if (peer_base_addr) {
+          char* req_rptr_actual =
+              reinterpret_cast<char*>(peer_base_addr) +
+              (reinterpret_cast<char*>(dst_ptr) -
+               reinterpret_cast<char*>(nvshmemi_device_state_d.heap_base));
+          const auto dst_int4_ptr = reinterpret_cast<int4*>(req_rptr_actual);
+          UNROLLED_WARP_COPY(7,
+                             lane_id,
+                             hidden_bf16_int4,
+                             dst_int4_ptr,
+                             x_int4,
+                             ld_nc_global,
+                             st_na_global);
+        } else {
+          nvshmemi_ibgda_put_nbi_warp(dst_ptr,
+                                      buf_ptr,
+                                      hidden * sizeof(nv_bfloat16),
+                                      dst_rank,
+                                      local_expert_idx,
+                                      lane_id,
+                                      token_idx - offset);
+        }
       }
     }
 
@@ -710,8 +762,22 @@ __global__ __launch_bounds__(
       while (ld_acquire_global(atomic_clean_flag) == 0) {
       }
       if (dst_rank != rank) {
-        nvshmemi_ibgda_amo_nonfetch_add(
-            rdma_recv_flag + global_expert_idx, 1, dst_rank, local_expert_idx);
+        void* peer_base_addr = reinterpret_cast<void*>(
+            __ldg(reinterpret_cast<const uint64_t*>(
+                      nvshmemi_device_state_d.peer_heap_base_p2p) +
+                  dst_rank));
+        if (peer_base_addr) {
+          int* req_rptr_actual = reinterpret_cast<int*>(
+              reinterpret_cast<char*>(peer_base_addr) +
+              (reinterpret_cast<char*>(rdma_recv_flag + global_expert_idx) -
+               reinterpret_cast<char*>(nvshmemi_device_state_d.heap_base)));
+          st_na_release(req_rptr_actual, 1);
+        } else {
+          nvshmemi_ibgda_amo_nonfetch_add(rdma_recv_flag + global_expert_idx,
+                                          1,
+                                          dst_rank,
+                                          local_expert_idx);
+        }
       } else {
         st_na_release(rdma_recv_flag + global_expert_idx, 1);
       }
 
@@ -1747,8 +1747,8 @@ void UnStackGradInferMeta(const std::vector<const MetaTensor*>& out_grad,
                         rank));
   if (axis < 0) axis += (rank + 1);
 
-  auto vec = common::vectorize<int>(input_dims[0]);
-  vec.insert(vec.begin() + axis, static_cast<int>(input_dims.size()));
+  auto vec = common::vectorize<int64_t>(input_dims[0]);
+  vec.insert(vec.begin() + axis, static_cast<int64_t>(input_dims.size()));
   x_grad->set_dims(common::make_ddim(vec));
   x_grad->set_dtype(out_grad[0]->dtype());
 }
 
@@ -6075,7 +6075,7 @@ void UnStackInferMeta(const MetaTensor& x,
             x_dim[axis],
             num));
   }
-  auto vec = common::vectorize<int>(x_dim);
+  auto vec = common::vectorize<int64_t>(x_dim);
   vec.erase(vec.begin() + axis);
   for (size_t i = 0; i < output_count; i++) {
     outs[i]->set_dims(common::make_ddim(vec));
 
@@ -45,7 +45,9 @@ void ConcatKernel(const Context& dev_ctx,
   phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis);
   out->Resize(out_dims);
   dev_ctx.template Alloc<T>(out);
-
+  if (out->numel() == 0) {
+    return;
+  }
   // If axis is 0, the lod of the output is not the same as inputs.
   if (axis == 0 && !x[0]->lod().empty()) {
     size_t lod_size_0 = x[0]->lod().size();
 
@@ -44,8 +44,41 @@ void SumRawKernel(const Context& dev_ctx,
                            out);
     return;
   }
-  phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
-      dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out);
+  if constexpr (std::is_same_v<T, phi::dtype::float16> ||
+                std::is_same_v<T, phi::dtype::bfloat16>) {
+    DenseTensor x_fp32 = phi::Cast<T, Context>(dev_ctx, x, DataType::FLOAT32);
+    DataType final_out_dtype = out_dtype;
+    if (final_out_dtype == DataType::UNDEFINED) {
+      final_out_dtype = x.dtype();
+    }
+    if (final_out_dtype == DataType::FLOAT32) {
+      phi::Reduce<CPUContext, float, phi::funcs::SumFunctor>(
+          dev_ctx,
+          x_fp32,
+          reduce_all,
+          dims.GetData(),
+          keep_dim,
+          phi::DataType::UNDEFINED,
+          out);
+    } else {
+      DenseTensor intermediate_result;
+      intermediate_result.set_meta(out->meta());
+      phi::Reduce<CPUContext, float, phi::funcs::SumFunctor>(
+          dev_ctx,
+          x_fp32,
+          reduce_all,
+          dims.GetData(),
+          keep_dim,
+          phi::DataType::UNDEFINED,
+          &intermediate_result);
+
+      phi::CastKernel<float, Context>(
+          dev_ctx, intermediate_result, final_out_dtype, out);
+    }
+  } else {
+    phi::Reduce<CPUContext, T, phi::funcs::SumFunctor>(
+        dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out);
+  }
 }
 
 }  // namespace phi
 
@@ -89,6 +89,10 @@ void TemporalShiftGradKernel(const Context& dev_ctx,
                              float shift_ratio,
                              const std::string& data_format_str,
                              DenseTensor* x_grad) {
+  if (x_grad && x_grad->numel() == 0) {
+    dev_ctx.template Alloc<T>(x_grad);
+    return;
+  }
   auto* input_grad = x_grad;
   auto* output_grad = &out_grad;
   int t = seg_num;
 
@@ -89,6 +89,10 @@ void TemporalShiftKernel(const Context& dev_ctx,
                          float shift_ratio,
                          const std::string& data_format_str,
                          DenseTensor* out) {
+  if (out && out->numel() == 0) {
+    dev_ctx.template Alloc<T>(out);
+    return;
+  }
   auto* input = &x;
   auto* output = out;
   int t = seg_num;
 
@@ -210,7 +210,7 @@ void LaunchUnStackKernel(const Context& ctx,
     constexpr int kWarpSize = 32;
     constexpr int kMaxOut = 16;
 
-    int tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
+    int64_t tid_x = 0, tid_y = 0, bid_x = 0, bid_y = 1;
     if (split_dim < kMaxOut) {
       tid_y = split_dim;
       tid_x =
@@ -219,10 +219,13 @@ void LaunchUnStackKernel(const Context& ctx,
     } else {
       tid_y = kMaxOut;
       tid_x = kWarpSize;
-      bid_y = backends::gpu::DivUp<int>(split_dim, kMaxOut);
+      bid_y = backends::gpu::DivUp<int64_t>(split_dim, kMaxOut);
     }
-    int tile_x_num = backends::gpu::DivUp<int>(out_row, tid_x);
-    bid_x = std::min(tile_x_num, backends::gpu::kMultiDimslimit);
+    int64_t tile_x_num = backends::gpu::DivUp<int64_t>(out_row, tid_x);
+    if (tile_x_num < static_cast<int64_t>(backends::gpu::kMultiDimslimit))
+      bid_x = tile_x_num;
+    else
+      bid_x = backends::gpu::kMultiDimslimit;
     dim3 blocks(tid_x, tid_y, 1);
     dim3 grids(bid_x, bid_y, 1);
Original file line number	Diff line number	Diff line change
`@@ -6075,7 +6075,7 @@ void UnStackInferMeta(const MetaTensor& x,`
`6075`	`6075`	`x_dim[axis],`
`6076`	`6076`	`num));`
`6077`	`6077`	`}`
`6078`		`- auto vec = common::vectorize<int>(x_dim);`
	`6078`	`+ auto vec = common::vectorize<int64_t>(x_dim);`
`6079`	`6079`	`vec.erase(vec.begin() + axis);`
`6080`	`6080`	`for (size_t i = 0; i < output_count; i++) {`
`6081`	`6081`	`outs[i]->set_dims(common::make_ddim(vec));`