paddle/phi/kernels/cpu/contiguous_kernel.cc (163 additions & 11 deletions)
@@ -15,11 +15,39 @@ limitations under the License. */

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/dense_tensor_iterator.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/impl/transpose_grad_kernel_impl.h"

#if defined(PADDLE_WITH_OPENMP)
#include <omp.h>
#endif

namespace phi {

inline int64_t DivUp(int64_t x, int64_t y) { return (x + y - 1) / y; }

inline void DealWithStride(const DenseTensorIterator& iter, int64_t* strides) {
  for (int dim = 0; dim < iter.ndim(); dim++) {
    for (int arg = 0; arg < iter.ntensors(); arg++) {
      *strides++ = iter.strides(arg)[dim];
    }
  }
  if (iter.ndim() < 2) {
    std::fill_n(strides, (2 - iter.ndim()) * iter.ntensors(), 0);
  }
}
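// The layout produced above is interleaved per dimension:
// strides[dim * ntensors + arg]. With one output (arg 0) and one input
// (arg 1) on a 2-D iterator this gives
//   { out_stride[dim0], in_stride[dim0], out_stride[dim1], in_stride[dim1] },
// and iterators with fewer than two dims are zero-padded so the 2-D copy
// loop below can always treat them uniformly.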

inline bool FastTransposeCopyValid(const DenseTensor& self,
                                   const DenseTensor& src) {
  constexpr int64_t MIN_NUMEL = 360;
  return src.numel() != 0 && src.dims().size() == 2 && src.strides()[0] == 1 &&
         src.strides()[1] == src.dims()[0] &&
         self.dims().size() == src.dims().size() && self.numel() >= MIN_NUMEL;
}
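// The fast path matches the common "transpose view made contiguous" case: a
// non-empty 2-D source whose strides are exactly the column-major layout of
// its shape (strides == {1, dims[0]}) copied into a row-major output of the
// same rank. MIN_NUMEL is a heuristic cutoff below which the tiled transpose
// is unlikely to pay for its bookkeeping.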

template <typename T, typename Context>
void ContiguousKernel(const Context& dev_ctx,
                      const DenseTensor& input,
@@ -31,21 +59,145 @@ void ContiguousKernel(const Context& dev_ctx,

  const T* input_data = input.data<T>();
  T* output_data = dev_ctx.template Alloc<T>(out);
  auto numel = input.numel();

  if (FastTransposeCopyValid(*out, input)) {
    // Fast path for 2D transpose
    constexpr int64_t TRANS_NUMEL = 60;
    void* trans_buffer =
        malloc(phi::SizeOf(input.dtype()) * TRANS_NUMEL * TRANS_NUMEL);
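    // Scratch tile of TRANS_NUMEL x TRANS_NUMEL elements: each source block
    // is gathered column-by-column, transposed in place, then flushed
    // row-by-row, so every memcpy runs over contiguous memory.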

    const T* tmp_src_ptr = input_data;
    T* tmp_out_ptr = output_data;
    T* tmp_buf_ptr = reinterpret_cast<T*>(trans_buffer);

    int64_t dim0 = out->dims()[0];
    int64_t dim1 = out->dims()[1];

    for (int64_t d0 = 0; d0 < dim0; d0 += TRANS_NUMEL) {
      for (int64_t d1 = 0; d1 < dim1; d1 += TRANS_NUMEL) {
        const T* src_ptr_inter = tmp_src_ptr + d0 + d1 * dim0;
        T* out_ptr_inter = tmp_out_ptr + d1 + d0 * dim1;

        int nr = std::min(dim0 - d0, TRANS_NUMEL);
        int nc = std::min(dim1 - d1, TRANS_NUMEL);

        for (int c = 0; c < nc; c++) {
          memcpy(tmp_buf_ptr + c * TRANS_NUMEL,
                 src_ptr_inter + c * dim0,
                 nr * sizeof(T));
        }

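        // Transpose the tile in place by swapping elements across its
        // diagonal; only the nr x nc corner holds valid data, hence the
        // rc_min/rc_max bounds.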
        int rc_max = std::max(nr, nc);
        int rc_min = std::min(nr, nc);
        for (int r = 0; r < rc_max; r++) {
          int end = std::min(r, rc_min);
          for (int c = 0; c < end; c++) {
            T tmp = tmp_buf_ptr[r + TRANS_NUMEL * c];
            tmp_buf_ptr[r + TRANS_NUMEL * c] =
                tmp_buf_ptr[r * TRANS_NUMEL + c];
            tmp_buf_ptr[r * TRANS_NUMEL + c] = tmp;
          }
        }

        for (int r = 0; r < nr; r++) {
          memcpy(out_ptr_inter + r * dim1,
                 tmp_buf_ptr + r * TRANS_NUMEL,
                 nc * sizeof(T));
        }
      }
    }
    free(trans_buffer);
  } else {
#if defined(PADDLE_WITH_OPENMP)
    // OpenMP parallel path
    phi::DenseTensorIteratorConfig config;
    config.add_output(*out);
    config.add_const_input(input);
    config.is_alloc_out_ = true;
    phi::DenseTensorIterator iter = config.build();
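    // Assumption (inferred from usage): like other TensorIterator-style
    // designs, the iterator coalesces compatible dims, so dims 0 and 1 are
    // the innermost, fastest-moving ones for the copy loop below.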

    std::vector<int64_t> tmp_strides(
        iter.ntensors() * static_cast<size_t>(std::max(iter.ndim(), 2)));

    DealWithStride(iter, tmp_strides.data());

    std::vector<int64_t> out_stride(tmp_strides.begin() + iter.ntensors(),
                                    tmp_strides.end());
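    // Despite its name, out_stride holds the dim-1 strides of *both* tensors
    // (output at index 0, input at index 1); it advances the row pointers in
    // the inner copy loop.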

    const int64_t iter_numel = iter.numel();

    const char* in_ptr = reinterpret_cast<const char*>(input_data);
    char* out_ptr = reinterpret_cast<char*>(output_data);

    int64_t end = iter_numel;
    int64_t begin = 0;
    int64_t grain_size = 32768;
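    // Heuristic: give each thread at least grain_size elements so that small
    // copies do not pay for thread fan-out.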

    int64_t* whole_stride = tmp_strides.data();

#pragma omp parallel
    {
      int64_t num_threads = omp_get_num_threads();

      if (grain_size > 0) {
        num_threads = std::min(num_threads, DivUp(end - begin, grain_size));
      }

      int64_t tid = omp_get_thread_num();
      int64_t chunk_size = DivUp(end - begin, num_threads);
      int64_t begin_tid = begin + tid * chunk_size;

      if (begin_tid < end) {
        int64_t range_start = begin_tid;
        int64_t range_end = std::min(end, chunk_size + begin_tid);

        auto dimiter = DimIter(iter.shape(), range_start, range_end);
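        // DimIter (semantics inferred from usage): walks the multi-dim index
        // space covering flat offsets [range_start, range_end), and
        // iter_for_step() reports how far the two innermost dims can advance
        // before wrapping.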
        while (!dimiter.iter_to_end()) {
          const auto v_ndim = dimiter.values.size();
          const char* tmp_in_data = in_ptr;
          char* tmp_out_data = out_ptr;
          for (size_t dim = 0; dim < v_ndim; dim++) {
            int64_t value = dimiter.values[dim];
            tmp_out_data += value * whole_stride[dim * iter.ntensors() + 0];
            tmp_in_data += value * whole_stride[dim * iter.ntensors() + 1];
          }

          auto step = dimiter.iter_for_step();

          for (int64_t i = 0; i < step[1]; i++) {
            for (int64_t j = 0; j < step[0]; j++) {
              const char* real_in_ptr = tmp_in_data + j * whole_stride[1];
              char* real_out_ptr = tmp_out_data + j * whole_stride[0];

              *reinterpret_cast<T*>(real_out_ptr) =
                  *reinterpret_cast<const T*>(real_in_ptr);
            }
            tmp_in_data = tmp_in_data + out_stride[1];
            tmp_out_data = tmp_out_data + out_stride[0];
          }

          dimiter.iter_to_next(step);
        }
      }
    }
#else
    // Serial fallback path
    int rank = input.dims().size();
    auto dims = input.dims();
    auto input_stride = input.strides();

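    // Decompose each linear output index into per-dim coordinates, innermost
    // dim first, and accumulate the matching strided offset into the input.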
    for (int64_t i = 0; i < numel; i++) {
      int64_t input_offset = 0;
      int64_t index_tmp = i;
      for (int dim = rank - 1; dim >= 0; --dim) {
        int64_t mod = index_tmp % dims[dim];
        index_tmp = index_tmp / dims[dim];
        input_offset += mod * input_stride[dim];
      }

      output_data[i] = input_data[input_offset];
    }
#endif
  }
}
}  // namespace phi