diff --git a/onnxruntime/core/providers/cuda/tensor/upsample.cc b/onnxruntime/core/providers/cuda/tensor/upsample.cc
index cbf745d3c7b4f..a38fe1efad540 100644
--- a/onnxruntime/core/providers/cuda/tensor/upsample.cc
+++ b/onnxruntime/core/providers/cuda/tensor/upsample.cc
@@ -290,16 +290,16 @@ Status Upsample<T>::BaseCompute(OpKernelContext* context,
       scales_div[i] = fast_divmod(gsl::narrow_cast<int>(ceil(scales[i])));
     }
 
-    UpampleImpl(Stream(context),
-                mode_,
-                rank,
-                (UpsampleMode::LINEAR == mode_) ? (rank == 2 ? X_dims[0] : X_dims[2]) : 0,
-                input_strides,
-                output_div_pitches,
-                scales_div,
-                reinterpret_cast<const CudaT*>(X->Data<T>()),
-                reinterpret_cast<CudaT*>(Y->MutableData<T>()),
-                output_count);
+    UpsampleImpl(Stream(context),
+                 mode_,
+                 rank,
+                 (UpsampleMode::LINEAR == mode_) ? (rank == 2 ? X_dims[0] : X_dims[2]) : 0,
+                 input_strides,
+                 output_div_pitches,
+                 scales_div,
+                 reinterpret_cast<const CudaT*>(X->Data<T>()),
+                 reinterpret_cast<CudaT*>(Y->MutableData<T>()),
+                 output_count);
   }
 
   return Status::OK();
diff --git a/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu b/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu
index d1c2ae6332994..24aeada559979 100644
--- a/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/upsample_impl.cu
@@ -8,12 +8,12 @@ namespace onnxruntime {
 namespace cuda {
 
 template <typename T, int rank>
-__global__ void _UpampleNearestKernel(const TArray<int64_t> input_pitches,
-                                      const TArray<fast_divmod> output_div_pitches,
-                                      const TArray<fast_divmod> scales_div,
-                                      const T* __restrict__ input_data,
-                                      T* __restrict__ output_data,
-                                      const size_t N) {
+__global__ void _UpsampleNearestKernel(const TArray<int64_t> input_pitches,
+                                       const TArray<fast_divmod> output_div_pitches,
+                                       const TArray<fast_divmod> scales_div,
+                                       const T* __restrict__ input_data,
+                                       T* __restrict__ output_data,
+                                       const size_t N) {
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
   CUDA_LONG input_index = 0;
   CUDA_LONG output_index = id;
@@ -36,13 +36,13 @@ __global__ void _UpampleNearestKernel(const TArray<int64_t> input_pitches,
 // This is the common use-case where the 4-D input (batched multi-channel images)
 // is usually of shape [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
 template <typename T>
-__global__ void _UpampleBilinear4DInputKernel(const int64_t input_dim2,
-                                              const TArray<int64_t> input_pitches,
-                                              const TArray<fast_divmod> output_div_pitches,
-                                              const TArray<fast_divmod> scales_div,
-                                              const T* __restrict__ input_data,
-                                              T* __restrict__ output_data,
-                                              const size_t N) {
+__global__ void _UpsampleBilinear4DInputKernel(const int64_t input_dim2,
+                                               const TArray<int64_t> input_pitches,
+                                               const TArray<fast_divmod> output_div_pitches,
+                                               const TArray<fast_divmod> scales_div,
+                                               const T* __restrict__ input_data,
+                                               T* __restrict__ output_data,
+                                               const size_t N) {
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
 
   CUDA_LONG input_index = 0;
@@ -95,13 +95,13 @@ __global__ void _UpampleBilinear4DInputKernel(const int64_t input_dim2,
 
 // The following method supports a 2-D input in 'Linear mode'
 template <typename T>
-__global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0,
-                                              const TArray<int64_t> input_pitches,
-                                              const TArray<fast_divmod> output_div_pitches,
-                                              const TArray<fast_divmod> scales_div,
-                                              const T* __restrict__ input_data,
-                                              T* __restrict__ output_data,
-                                              const size_t N) {
+__global__ void _UpsampleBilinear2DInputKernel(const int64_t input_dim0,
+                                               const TArray<int64_t> input_pitches,
+                                               const TArray<fast_divmod> output_div_pitches,
+                                               const TArray<fast_divmod> scales_div,
+                                               const T* __restrict__ input_data,
+                                               T* __restrict__ output_data,
+                                               const size_t N) {
   CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id, N);
 
   CUDA_LONG input_index = 0;
@@ -147,32 +147,32 @@ __global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0,
 }
 
 template <typename T>
-void UpampleImpl(cudaStream_t stream,
-                 const onnxruntime::UpsampleMode upsample_mode,
-                 const size_t rank,
-                 const int64_t input_dim2,
-                 const TArray<int64_t>& input_pitches,
-                 const TArray<fast_divmod>& output_div_pitches,
-                 const TArray<fast_divmod>& scales_div,
-                 const T* input_data,
-                 T* output_data,
-                 const size_t N) {
+void UpsampleImpl(cudaStream_t stream,
+                  const onnxruntime::UpsampleMode upsample_mode,
+                  const size_t rank,
+                  const int64_t input_dim2,
+                  const TArray<int64_t>& input_pitches,
+                  const TArray<fast_divmod>& output_div_pitches,
+                  const TArray<fast_divmod>& scales_div,
+                  const T* input_data,
+                  T* output_data,
+                  const size_t N) {
   int blocksPerGrid = (int)(ceil(static_cast<float>(N) / GridDim::maxThreadsPerBlock));
   if (onnxruntime::UpsampleMode::NN == upsample_mode) {
     if (rank == 4) {
-      _UpampleNearestKernel<T, 4><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+      _UpsampleNearestKernel<T, 4><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
           input_pitches, output_div_pitches, scales_div,
           input_data, output_data, N);
     } else if (rank == 3) {
-      _UpampleNearestKernel<T, 3><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+      _UpsampleNearestKernel<T, 3><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
           input_pitches, output_div_pitches, scales_div,
           input_data, output_data, N);
     } else if (rank == 2) {
-      _UpampleNearestKernel<T, 2><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+      _UpsampleNearestKernel<T, 2><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
           input_pitches, output_div_pitches, scales_div,
           input_data, output_data, N);
     } else if (rank == 1) {
-      _UpampleNearestKernel<T, 1><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+      _UpsampleNearestKernel<T, 1><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
          input_pitches, output_div_pitches, scales_div,
          input_data, output_data, N);
     } else {
@@ -180,11 +180,11 @@ void UpampleImpl(cudaStream_t stream,
     }
   } else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode) {
     if (rank == 4) {
-      _UpampleBilinear4DInputKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+      _UpsampleBilinear4DInputKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
          input_dim2, input_pitches, output_div_pitches, scales_div,
          input_data, output_data, N);
     } else if (rank == 2) {
-      _UpampleBilinear2DInputKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
+      _UpsampleBilinear2DInputKernel<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
          input_dim2, input_pitches, output_div_pitches, scales_div,
          input_data, output_data, N);
     } else {
@@ -197,17 +197,17 @@ void UpampleImpl(cudaStream_t stream,
   }
 }
 
-#define SPECIALIZED_IMPL(T)                                                   \
-  template void UpampleImpl<T>(cudaStream_t stream,                           \
-                               const onnxruntime::UpsampleMode upsample_mode, \
-                               const size_t rank,                             \
-                               const int64_t input_dim2,                      \
-                               const TArray<int64_t>& input_pitches,          \
-                               const TArray<fast_divmod>& output_div_pitches, \
-                               const TArray<fast_divmod>& scales_div,         \
-                               const T* input_data,                           \
-                               T* output_data,                                \
-                               const size_t N);
+#define SPECIALIZED_IMPL(T)                                                    \
+  template void UpsampleImpl<T>(cudaStream_t stream,                           \
+                                const onnxruntime::UpsampleMode upsample_mode, \
+                                const size_t rank,                             \
+                                const int64_t input_dim2,                      \
+                                const TArray<int64_t>& input_pitches,          \
+                                const TArray<fast_divmod>& output_div_pitches, \
+                                const TArray<fast_divmod>& scales_div,         \
+                                const T* input_data,                           \
+                                T* output_data,                                \
+                                const size_t N);
 
 SPECIALIZED_IMPL(float)
 SPECIALIZED_IMPL(double)
diff --git a/onnxruntime/core/providers/cuda/tensor/upsample_impl.h b/onnxruntime/core/providers/cuda/tensor/upsample_impl.h
index 250ec6b272e34..fb47ad8301615 100644
--- a/onnxruntime/core/providers/cuda/tensor/upsample_impl.h
+++ b/onnxruntime/core/providers/cuda/tensor/upsample_impl.h
@@ -11,16 +11,16 @@ namespace onnxruntime {
 namespace cuda {
 
 template <typename T>
-void UpampleImpl(cudaStream_t stream,
-                 const onnxruntime::UpsampleMode upsample_mode,
-                 const size_t rank,
-                 const int64_t input_dim2,
-                 const TArray<int64_t>& input_pitches,
-                 const TArray<fast_divmod>& output_div_pitches,
-                 const TArray<fast_divmod>& scales_div,
-                 const T* input_data,
-                 T* output_data,
-                 const size_t N);
+void UpsampleImpl(cudaStream_t stream,
+                  const onnxruntime::UpsampleMode upsample_mode,
+                  const size_t rank,
+                  const int64_t input_dim2,
+                  const TArray<int64_t>& input_pitches,
+                  const TArray<fast_divmod>& output_div_pitches,
+                  const TArray<fast_divmod>& scales_div,
+                  const T* input_data,
+                  T* output_data,
+                  const size_t N);
 
 }  // namespace cuda
 }  // namespace onnxruntime
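
All of the kernels touched above share one piece of indexing arithmetic: decompose the flat output element index into per-dimension coordinates using the output pitches, map each coordinate back to the input (in nearest mode, an integer division by that dimension's scale), and re-linearize against the input pitches. The following is a minimal CPU sketch of that mapping for the nearest-neighbor case, not part of the patch itself. NearestInputIndex is a hypothetical standalone helper: it assumes integer scale factors, matching the fast_divmod(gsl::narrow_cast<int>(ceil(scales[i]))) setup in upsample.cc, and substitutes plain division and std::vector for the fast_divmod and TArray types used on device.

// CPU sketch of the index mapping performed by _UpsampleNearestKernel
// (hypothetical standalone helper; the real kernel uses TArray and fast_divmod).
#include <cstdint>
#include <cstdio>
#include <vector>

int64_t NearestInputIndex(int64_t output_index,
                          const std::vector<int64_t>& output_pitches,  // output strides per dim
                          const std::vector<int64_t>& input_pitches,   // input strides per dim
                          const std::vector<int64_t>& scales) {        // integer scale per dim
  int64_t input_index = 0;
  for (size_t dim = 0; dim < output_pitches.size(); ++dim) {
    int64_t coord = output_index / output_pitches[dim];  // output coordinate in this dim
    output_index %= output_pitches[dim];                 // remainder feeds the next dim
    input_index += (coord / scales[dim]) * input_pitches[dim];  // nearest source element
  }
  return input_index;
}

int main() {
  // 2x upsample of a [1, 1, 2, 2] input to [1, 1, 4, 4]:
  // output element (n=0, c=0, y=3, x=2) should read input element (y=1, x=1).
  std::vector<int64_t> output_pitches = {16, 16, 4, 1};
  std::vector<int64_t> input_pitches = {4, 4, 2, 1};
  std::vector<int64_t> scales = {1, 1, 2, 2};
  int64_t out_idx = 3 * 4 + 2;  // flat index of (0, 0, 3, 2)
  std::printf("input index: %lld\n",  // prints 3, i.e. input (0, 0, 1, 1)
              (long long)NearestInputIndex(out_idx, output_pitches, input_pitches, scales));
  return 0;
}

In the CUDA kernels this loop runs once per output element with the rank fixed at compile time, and fast_divmod replaces each divide/modulo pair with a host-precomputed multiply-and-shift, which is why output_div_pitches and scales_div travel to the kernel as fast_divmod arrays rather than raw integers.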