diff --git a/onnxruntime/core/providers/cpu/tensor/tile.cc b/onnxruntime/core/providers/cpu/tensor/tile.cc
index 9f26e4e88a5f8..73f5122aebb90 100644
--- a/onnxruntime/core/providers/cpu/tensor/tile.cc
+++ b/onnxruntime/core/providers/cpu/tensor/tile.cc
@@ -100,20 +100,32 @@ Status TileCoreForFixedSizeTypes(const Tensor& input_tensor, Tensor& output_tens
 }
 
 namespace TileOp {
-// Find the first non-1 repeat and check the input shape to the left of that dimension,
-// if the dim values are 1, then the tiling logic is essentially copying the input buffer
-// multiple times. The number of times can be computed as the product of the repeat values.
+// Find the first non-1 repeat and check the input shape to the left of that dimension:
+// 1) If the dim values to the left are all 1s (or don't exist), then the tiling logic is essentially copying the input buffer
+// multiple times. The number of times can be computed as the product of the repeat values. (OR)
+// 2) Allow at most one non-1 dim value to the left (for the batch dimension); in this case, the sub-tensor at each batch index
+// is copied multiple times. This is still faster because it avoids the rest of the Tile operator's machinery.
 bool IsTileMemcpy(const TensorShape& input_shape,
                   const int64_t* repeats,
                   size_t rank,
-                  /*out*/ size_t& num_of_copies) {
+                  /*out*/ bool& is_batched_memcpy,
+                  /*out*/ size_t& num_of_elements_per_batch,
+                  /*out*/ size_t& num_of_copies_per_batch,
+                  /*out*/ size_t& num_of_batch_copies) {
   for (int64_t i = static_cast<int64_t>(rank) - 1; i >= 0; --i) {
     if (repeats[i] != 1) {
       if (input_shape.SizeToDimension(i) == 1) {
-        num_of_copies = 1;
+        num_of_copies_per_batch = 1;
         for (int64_t j = 0; j <= i; ++j) {
-          num_of_copies *= repeats[j];
+          num_of_copies_per_batch *= repeats[j];
         }
+        is_batched_memcpy = false;
+        return true;
+      } else if (i == 1) {  // else check if the previous dim is just the batch dim
+        num_of_elements_per_batch = static_cast<size_t>(input_shape.SizeFromDimension(1));
+        num_of_copies_per_batch = repeats[i];
+        num_of_batch_copies = repeats[0];
+        is_batched_memcpy = true;
         return true;
       } else {
         break;
@@ -166,20 +178,57 @@ Status Tile::Compute(OpKernelContext* ctx) const {
     return Status::OK();
   }
 
-  size_t num_of_copies = 1;
-  if (TileOp::IsTileMemcpy(input_shape, repeats, input_rank, num_of_copies)) {
+  bool is_batched_memcpy = false;
+  size_t num_of_elements_per_batch = 1;
+  size_t num_of_copies_per_batch = 1;
+  size_t num_of_batch_copies = 1;
+  if (TileOp::IsTileMemcpy(input_shape,
+                           repeats,
+                           input_rank,
+                           is_batched_memcpy,
+                           num_of_elements_per_batch,
+                           num_of_copies_per_batch,
+                           num_of_batch_copies)) {
     // TODO: Handle string copies when the kernel eventually supports string type.
     // For now, it shouldn't throw in the enforce as the kernel doesn't claim string support
    ORT_ENFORCE(!input_tensor.IsDataType<std::string>(), "Tile doesn't support string type yet");
 
     int8_t* output_data_casted = reinterpret_cast<int8_t*>(output_tensor.MutableDataRaw());
+    const int8_t* input_data_casted = reinterpret_cast<const int8_t*>(input_tensor.DataRaw());
     const void* input_data_raw = input_tensor.DataRaw();
 
-    size_t tensor_size_in_bytes = input_tensor.SizeInBytes();
-    // TODO: Add multi-threading logic if num_of_copies is large enough
-    for (size_t i = 0; i < num_of_copies; ++i) {
-      memcpy(static_cast<void*>(output_data_casted), input_data_raw, tensor_size_in_bytes);
-      output_data_casted += tensor_size_in_bytes;
+    if (!is_batched_memcpy) {
+      size_t copy_bytes = input_tensor.SizeInBytes();
+      // TODO: Add multi-threading logic if num_of_copies_per_batch is large enough
+      for (size_t i = 0; i < num_of_copies_per_batch; ++i) {
+        memcpy(static_cast<void*>(output_data_casted), input_data_raw, copy_bytes);
+        output_data_casted += copy_bytes;
+      }
+    } else {
+      size_t copy_bytes = num_of_elements_per_batch * input_tensor.DataType()->Size();
+      size_t batch_count = static_cast<size_t>(input_tensor.Shape()[0]);  // The tensor is at least 1-D - this is safe
+
+      // TODO: Multi-thread if needed
+      for (size_t batch = 0; batch < batch_count; ++batch) {
+        for (size_t i = 0; i < num_of_copies_per_batch; ++i) {
+          memcpy(static_cast<void*>(output_data_casted), static_cast<const void*>(input_data_casted), copy_bytes);
+          output_data_casted += copy_bytes;
+        }
+        input_data_casted += copy_bytes;
+      }
+
+      // Now account for batch dim repeat
+      if (num_of_batch_copies > 1) {
+        // reset some values
+        output_data_casted = reinterpret_cast<int8_t*>(output_tensor.MutableDataRaw());
+        copy_bytes *= num_of_copies_per_batch * batch_count;
+        int8_t* copy_ptr = output_data_casted + copy_bytes;
+
+        for (size_t i = 1; i < num_of_batch_copies; ++i) {
+          memcpy(static_cast<void*>(copy_ptr), static_cast<const void*>(output_data_casted), copy_bytes);
+          copy_ptr += copy_bytes;
+        }
+      }
     }
 
     return Status::OK();
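Note (illustration, not part of the patch): the standalone C++ sketch below mirrors the two-step copy pattern of the new batched CPU path above, on a plain std::vector and with the shapes used by the new unit test (input shape [2, 1, 3], repeats [2, 2, 1]). All names are local to the sketch.

#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int> input = {111, 112, 113, 11, 12, 13};  // logical shape [2, 1, 3]
  const size_t num_of_elements_per_batch = 3;  // product of the dims to the right of the batch dim
  const size_t batch_count = 2;                // input_shape[0]
  const size_t num_of_copies_per_batch = 2;    // repeats[1]
  const size_t num_of_batch_copies = 2;        // repeats[0]

  std::vector<int> output(input.size() * num_of_copies_per_batch * num_of_batch_copies);
  int* out = output.data();
  const int* in = input.data();

  // Step 1: copy each batch's sub-tensor num_of_copies_per_batch times.
  for (size_t batch = 0; batch < batch_count; ++batch) {
    for (size_t i = 0; i < num_of_copies_per_batch; ++i) {
      std::memcpy(out, in, num_of_elements_per_batch * sizeof(int));
      out += num_of_elements_per_batch;
    }
    in += num_of_elements_per_batch;
  }

  // Step 2: repeat the whole block written so far to account for the batch-dim repeat.
  const size_t block = num_of_elements_per_batch * num_of_copies_per_batch * batch_count;
  for (size_t i = 1; i < num_of_batch_copies; ++i) {
    std::memcpy(output.data() + i * block, output.data(), block * sizeof(int));
  }

  // Prints two copies of: 111 112 113 111 112 113 11 12 13 11 12 13
  for (int v : output) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}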
diff --git a/onnxruntime/core/providers/cpu/tensor/tile.h b/onnxruntime/core/providers/cpu/tensor/tile.h
index 74d8ee7ded3ac..093ee74969bf8 100644
--- a/onnxruntime/core/providers/cpu/tensor/tile.h
+++ b/onnxruntime/core/providers/cpu/tensor/tile.h
@@ -14,10 +14,24 @@ namespace TileOp {
 // repeats: [1, 200, 1]
 // output shape: [1, 200, 256 * 50]
+
+// As a slight extension, it also supports making "batched" copies of the input data buffer
+// (`is_batched_memcpy` will be set to true)
+// E.g.: input_shape: [5, 1, 256 * 50]
+// repeats: [1, 200, 1]
+// output shape: [5, 200, 256 * 50]
+
+// Repeating the batch is also supported
+// E.g.: input_shape: [5, 1, 256 * 50]
+// repeats: [2, 200, 1]
+// output shape: [10, 200, 256 * 50]
+
 bool IsTileMemcpy(const TensorShape& input_shape,
                   const int64_t* repeats,
                   size_t rank,
-                  /*out*/ size_t& num_of_copies);
+                  /*out*/ bool& is_batched_memcpy,
+                  /*out*/ size_t& num_of_elements_per_batch,
+                  /*out*/ size_t& num_of_copies_per_batch,
+                  /*out*/ size_t& num_of_batch_copies);
 }  // namespace TileOp
 
 struct Tile : OpKernel {
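Note (illustration, not part of the patch): a standalone sketch that mirrors the decision logic documented above on plain arrays, showing what the out-parameters work out to for the batched example in the header comment. IsTileMemcpySketch is a hypothetical local helper, not the library function.

#include <cstdint>
#include <cstdio>

// Hypothetical helper that mirrors the documented contract of TileOp::IsTileMemcpy.
static bool IsTileMemcpySketch(const int64_t* dims, const int64_t* repeats, size_t rank,
                               bool& is_batched, size_t& elems_per_batch,
                               size_t& copies_per_batch, size_t& batch_copies) {
  // Walk from the innermost dim and stop at the first non-1 repeat encountered.
  for (int64_t i = static_cast<int64_t>(rank) - 1; i >= 0; --i) {
    if (repeats[i] == 1) continue;
    int64_t left = 1;
    for (int64_t j = 0; j < i; ++j) left *= dims[j];  // product of dims to the left (SizeToDimension)
    if (left == 1) {  // case 1: plain repeated memcpy of the whole buffer
      copies_per_batch = 1;
      for (int64_t j = 0; j <= i; ++j) copies_per_batch *= repeats[j];
      is_batched = false;
      return true;
    } else if (i == 1) {  // case 2: only the batch dim sits to the left
      elems_per_batch = 1;
      for (size_t j = 1; j < rank; ++j) elems_per_batch *= dims[j];  // SizeFromDimension(1)
      copies_per_batch = repeats[1];
      batch_copies = repeats[0];
      is_batched = true;
      return true;
    }
    break;
  }
  return false;
}

int main() {
  const int64_t dims[] = {5, 1, 256 * 50};
  const int64_t repeats[] = {2, 200, 1};
  bool is_batched = false;
  size_t elems = 1, copies = 1, batch_copies = 1;
  if (IsTileMemcpySketch(dims, repeats, 3, is_batched, elems, copies, batch_copies)) {
    // Expected: batched=1 elems_per_batch=12800 copies_per_batch=200 batch_copies=2
    std::printf("batched=%d elems_per_batch=%zu copies_per_batch=%zu batch_copies=%zu\n",
                is_batched, elems, copies, batch_copies);
  }
  return 0;
}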
diff --git a/onnxruntime/core/providers/cuda/tensor/tile.cc b/onnxruntime/core/providers/cuda/tensor/tile.cc
index 5aac0d0430525..7b68b19ce38be 100644
--- a/onnxruntime/core/providers/cuda/tensor/tile.cc
+++ b/onnxruntime/core/providers/cuda/tensor/tile.cc
@@ -76,31 +76,73 @@ Status Tile::ComputeInternal(OpKernelContext* ctx) const {
     return Status::OK();
   }
 
-  size_t num_of_copies = 1;
-  if (TileOp::IsTileMemcpy(input_shape, repeats, rank, num_of_copies)) {
-    if (input_tensor.IsDataType<float>() ||
-        input_tensor.IsDataType<int32_t>()) {
-      TileMemcpyImpl(
-          reinterpret_cast<const typename ToCudaType<float>::MappedType*>(input_data),
-          input_shape.Size(),
-          reinterpret_cast<typename ToCudaType<float>::MappedType*>(output_data),
-          output_shape.Size());
-    } else if (input_tensor.IsDataType<double>() ||
-               input_tensor.IsDataType<int64_t>()) {
-      TileMemcpyImpl(
-          reinterpret_cast<const typename ToCudaType<double>::MappedType*>(input_data),
-          input_shape.Size(),
-          reinterpret_cast<typename ToCudaType<double>::MappedType*>(output_data),
-          output_shape.Size());
-    } else if (input_tensor.IsDataType<MLFloat16>()) {
-      TileMemcpyImpl(
-          reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType*>(input_data),
-          input_shape.Size(),
-          reinterpret_cast<typename ToCudaType<MLFloat16>::MappedType*>(output_data),
-          output_shape.Size());
+  bool is_batched_memcpy = false;
+  size_t num_of_elements_per_batch = 1;
+  size_t num_of_copies_per_batch = 1;
+  size_t num_of_batch_copies = 1;
+  if (TileOp::IsTileMemcpy(input_shape,
+                           repeats,
+                           rank,
+                           is_batched_memcpy,
+                           num_of_elements_per_batch,
+                           num_of_copies_per_batch,
+                           num_of_batch_copies)) {
+    if (!is_batched_memcpy) {
+      if (input_tensor.IsDataType<float>() ||
+          input_tensor.IsDataType<int32_t>()) {
+        TileMemcpyImpl(
+            reinterpret_cast<const typename ToCudaType<float>::MappedType*>(input_data),
+            input_shape.Size(),
+            reinterpret_cast<typename ToCudaType<float>::MappedType*>(output_data),
+            output_shape.Size());
+      } else if (input_tensor.IsDataType<double>() ||
+                 input_tensor.IsDataType<int64_t>()) {
+        TileMemcpyImpl(
+            reinterpret_cast<const typename ToCudaType<double>::MappedType*>(input_data),
+            input_shape.Size(),
+            reinterpret_cast<typename ToCudaType<double>::MappedType*>(output_data),
+            output_shape.Size());
+      } else if (input_tensor.IsDataType<MLFloat16>()) {
+        TileMemcpyImpl(
+            reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType*>(input_data),
+            input_shape.Size(),
+            reinterpret_cast<typename ToCudaType<MLFloat16>::MappedType*>(output_data),
+            output_shape.Size());
+      } else {
+        // Won't hit this as the kernel doesn't claim support for any type that will trigger this
+        ORT_THROW("Tile doesn't have an implementation yet for the type: ", input_tensor.DataType());
+      }
     } else {
-      // Won't hit this as the kernel doesn't claim support for any type that will trigger this
-      ORT_THROW("Tile doesn't have an implementation yet for the type: ", input_tensor.DataType());
+      if (input_tensor.IsDataType<float>() ||
+          input_tensor.IsDataType<int32_t>()) {
+        TileBatchedMemcpyImpl(
+            reinterpret_cast<const typename ToCudaType<float>::MappedType*>(input_data),
+            num_of_elements_per_batch,
+            input_shape[0],  // The tensor is at least 1-D - this is safe
+            fast_divmod(static_cast<int>(num_of_elements_per_batch * num_of_copies_per_batch)),
+            reinterpret_cast<typename ToCudaType<float>::MappedType*>(output_data),
+            output_shape.Size());
+      } else if (input_tensor.IsDataType<double>() ||
+                 input_tensor.IsDataType<int64_t>()) {
+        TileBatchedMemcpyImpl(
+            reinterpret_cast<const typename ToCudaType<double>::MappedType*>(input_data),
+            num_of_elements_per_batch,
+            input_shape[0],  // The tensor is at least 1-D - this is safe
+            fast_divmod(static_cast<int>(num_of_elements_per_batch * num_of_copies_per_batch)),
+            reinterpret_cast<typename ToCudaType<double>::MappedType*>(output_data),
+            output_shape.Size());
+      } else if (input_tensor.IsDataType<MLFloat16>()) {
+        TileBatchedMemcpyImpl(
+            reinterpret_cast<const typename ToCudaType<MLFloat16>::MappedType*>(input_data),
+            num_of_elements_per_batch,
+            input_shape[0],  // The tensor is at least 1-D - this is safe
+            fast_divmod(static_cast<int>(num_of_elements_per_batch * num_of_copies_per_batch)),
+            reinterpret_cast<typename ToCudaType<MLFloat16>::MappedType*>(output_data),
+            output_shape.Size());
+      } else {
+        // Won't hit this as the kernel doesn't claim support for any type that will trigger this
+        ORT_THROW("Tile doesn't have an implementation yet for the type: ", input_tensor.DataType());
+      }
     }
 
     return Status::OK();
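Note (illustration, not part of the patch): the concrete type lists in the dispatch above are reconstructed and should be read as an assumption; the underlying idea is that these memcpy-style kernels only move bits, so one instantiation can serve every type of the same element size. The static_asserts below state the size equalities such a grouping relies on.

#include <cstdint>

// Size equalities assumed when 4-byte and 8-byte types share a single instantiation.
static_assert(sizeof(float) == sizeof(int32_t), "4-byte types can share one instantiation");
static_assert(sizeof(double) == sizeof(int64_t), "8-byte types can share one instantiation");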
diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu
index 33696d1a26076..a66db85a2fd6c 100644
--- a/onnxruntime/core/providers/cuda/tensor/tile_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.cu
@@ -67,9 +67,43 @@ void TileMemcpyImpl(
       input_data, num_input_elements, output_data, (CUDA_LONG)num_output_elements);
 }
 
+template <typename T>
+__global__ void _TileBatchedMemcpyKernel(
+    const T* input_data,
+    const size_t num_of_elements_per_input_batch,
+    const size_t num_input_batch_count,
+    const fast_divmod num_of_elements_per_output_batch,
+    T* output_data,
+    const size_t N) {
+  CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
+  CUDA_LONG batch_idx = 0;
+  CUDA_LONG element_idx = 0;
+  num_of_elements_per_output_batch.divmod(id, batch_idx, element_idx);
+  output_data[id] = input_data[(batch_idx % num_input_batch_count) * num_of_elements_per_input_batch + (element_idx % num_of_elements_per_input_batch)];
+}
+
+template <typename T>
+void TileBatchedMemcpyImpl(
+    const T* input_data,
+    const size_t num_of_elements_per_input_batch,
+    const size_t num_input_batch_count,
+    const fast_divmod& num_of_elements_per_output_batch,
+    T* output_data,
+    const size_t num_output_elements) {
+  int blocksPerGrid = (int)(ceil(static_cast<float>(num_output_elements) / GridDim::maxThreadsPerBlock));
+  _TileBatchedMemcpyKernel<<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0>>>(
+      input_data,
+      num_of_elements_per_input_batch,
+      num_input_batch_count,
+      num_of_elements_per_output_batch,
+      output_data,
+      (CUDA_LONG)num_output_elements);
+}
+
 #define SPECIALIZED_IMPL(T) \
   template void TileImpl(const size_t shape_rank, const TArray<fast_divmod>& fdm_input_shape, const TArray<int64_t>& input_stride, const T* input_data, const TArray<fast_divmod>& fdm_output_strides, T* output_data, const size_t N); \
-  template void TileMemcpyImpl(const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements);
+  template void TileMemcpyImpl(const T* input_data, const size_t num_input_elements, T* output_data, const size_t num_output_elements); \
+  template void TileBatchedMemcpyImpl(const T* input_data, const size_t num_of_elements_per_input_batch, const size_t num_input_batch_count, const fast_divmod& num_of_elements_per_output_batch, T* output_data, const size_t num_output_elements);
 
 SPECIALIZED_IMPL(float)
 SPECIALIZED_IMPL(double)
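Note (illustration, not part of the patch): a host-side C++ sketch of the per-output-element index mapping computed by _TileBatchedMemcpyKernel above, with plain / and % standing in for fast_divmod. The shapes match the new batched unit test; all names are local to the sketch.

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const std::vector<int> input = {111, 112, 113, 11, 12, 13};  // logical shape [2, 1, 3]
  const size_t elems_per_input_batch = 3;
  const size_t input_batch_count = 2;
  const size_t copies_per_batch = 2;  // repeats[1]
  const size_t elems_per_output_batch = elems_per_input_batch * copies_per_batch;  // the fast_divmod divisor
  const size_t N = 24;  // number of elements in the [4, 2, 3] output

  std::vector<int> output(N);
  for (size_t id = 0; id < N; ++id) {
    const size_t batch_idx = id / elems_per_output_batch;    // fast_divmod quotient
    const size_t element_idx = id % elems_per_output_batch;  // fast_divmod remainder
    output[id] = input[(batch_idx % input_batch_count) * elems_per_input_batch +
                       (element_idx % elems_per_input_batch)];
  }

  // Produces the same element order the CPU batched path writes for this test case.
  for (int v : output) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}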
diff --git a/onnxruntime/core/providers/cuda/tensor/tile_impl.h b/onnxruntime/core/providers/cuda/tensor/tile_impl.h
index cfe5391073f68..27404c8d39b6b 100644
--- a/onnxruntime/core/providers/cuda/tensor/tile_impl.h
+++ b/onnxruntime/core/providers/cuda/tensor/tile_impl.h
@@ -25,5 +25,14 @@ void TileMemcpyImpl(
     T* output_data,
     const size_t num_output_elements);
 
+template <typename T>
+void TileBatchedMemcpyImpl(
+    const T* input_data,
+    const size_t num_of_elements_per_input_batch,
+    const size_t num_input_batch_count,
+    const fast_divmod& num_of_elements_per_output_batch,
+    T* output_data,
+    const size_t num_output_elements);
+
 }  // namespace cuda
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
index 4d5c2384f7aec..e7636fbba1b44 100644
--- a/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/tile_op_test.cc
@@ -48,10 +48,21 @@ void RunTestWrapper() {
   RunTest({111, 112, 113, 122, 123, 124}, {2, 1, 3}, {1, 1, 1}, {3}, {111, 112, 113, 122, 123, 124}, {2, 1, 3});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 1
+  // This will trigger the MemCpy optimization path
   RunTest({111, 112, 113}, {1, 1, 3}, {2, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113, 111, 112, 113}, {2, 2, 3});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 2
+  // This will trigger the MemCpy optimization path
   RunTest({111, 112, 113}, {1, 1, 3}, {3, 1, 1}, {3}, {111, 112, 113, 111, 112, 113, 111, 112, 113}, {3, 1, 3});
+
+  // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1)
+  // This will trigger the (Batched) MemCpy optimization path
+  RunTest({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {1, 2, 1}, {3}, {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {2, 2, 3});
+
+  // TileWhichIsBasicallyCopiesOfInputBuffer - 4 (batch > 1 and batch_repeat > 1)
+  // This will trigger the (Batched) MemCpy optimization path
+  RunTest({111, 112, 113, 11, 12, 13}, {2, 1, 3}, {2, 2, 1}, {3},
+          {111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13, 111, 112, 113, 111, 112, 113, 11, 12, 13, 11, 12, 13}, {4, 2, 3});
 }
 
 template <>
@@ -78,10 +89,23 @@ void RunTestWrapper() {
   RunTest({true, false, true, false, true, true}, {2, 1, 3}, {1, 1, 1}, {3}, {true, false, true, false, true, true}, {2, 1, 3});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 1
+  // This will trigger the MemCpy optimization path
   RunTest({true, false, true}, {1, 1, 3}, {2, 2, 1}, {3}, {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
 
   // TileWhichIsBasicallyCopiesOfInputBuffer - 2
+  // This will trigger the MemCpy optimization path
   RunTest({true, false, true}, {1, 1, 3}, {3, 1, 1}, {3}, {true, false, true, true, false, true, true, false, true}, {3, 1, 3});
+
+  // TileWhichIsBasicallyCopiesOfInputBuffer - 3 (batch > 1 and batch_repeat == 1)
+  // This will trigger the (Batched) MemCpy optimization path
+  RunTest({true, false, true, true, false, true}, {2, 1, 3}, {1, 2, 1}, {3},
+          {true, false, true, true, false, true, true, false, true, true, false, true}, {2, 2, 3});
+
+  // TileWhichIsBasicallyCopiesOfInputBuffer - 4 (batch > 1 and batch_repeat > 1)
+  // This will trigger the (Batched) MemCpy optimization path
+  RunTest({true, false, true, true, false, true}, {2, 1, 3}, {2, 2, 1}, {3},
+          {true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true, true, false, true},
+          {4, 2, 3});
 }
 
 TEST(TensorOpTest, TileFloatType) {
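Note (illustration, not part of the patch): a tiny sketch of how the expected output shapes in the new tests are derived; each output dimension is the corresponding input dimension multiplied by its repeat. TiledShape is a hypothetical helper local to the sketch.

#include <cstdint>
#include <iostream>
#include <vector>

// Output dims of Tile are the elementwise product of input dims and repeats.
static std::vector<int64_t> TiledShape(const std::vector<int64_t>& dims,
                                       const std::vector<int64_t>& repeats) {
  std::vector<int64_t> out(dims.size());
  for (size_t i = 0; i < dims.size(); ++i) out[i] = dims[i] * repeats[i];
  return out;
}

int main() {
  // Input shape {2, 1, 3} with repeats {2, 2, 1} -> expected output shape {4, 2, 3}.
  for (int64_t d : TiledShape({2, 1, 3}, {2, 2, 1})) std::cout << d << ' ';
  std::cout << '\n';
  return 0;
}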