Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions onnxruntime/core/providers/cpu/object_detection/roialign.cc
Original file line number Diff line number Diff line change
Expand Up @@ -295,13 +295,9 @@ Status CheckROIAlignValidInput(const Tensor* X_ptr, const Tensor* rois_ptr, cons
"First dimension (num_rois) of batch_indices and rois don't match");
}

// Validate batch_indices values are within [0, batch_size).
// Only check when the tensor data is accessible from the host (CPU).
// For GPU tensors (e.g. CUDA EP), Data<T>() returns a device pointer
// that cannot be safely dereferenced on the host. A device-side bounds
// check for the CUDA path would require passing batch_size into the
// CUDA kernel — tracked as a follow-up.
if (batch_indices_ptr->Location().device.Type() == OrtDevice::CPU) {
// Validate batch_indices values are within [0, batch_size) when the tensor
// data is accessible from the host (CPU).
const int64_t batch_size = X_ptr->Shape()[0];
const int64_t num_rois = batch_indices_dims[0];

Expand Down
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/cuda/object_detection/roialign.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ Status RoiAlign<T>::ComputeInternal(OpKernelContext* context) const {
reinterpret_cast<typename ToCudaType<T>::MappedType*>(Y.MutableData<T>()),
this->mode_ == RoiAlignMode::avg,
this->half_pixel_,
batch_indices_ptr->Data<int64_t>());
batch_indices_ptr->Data<int64_t>(),
x_dims[0]); // batch_size
}

return Status::OK();
Expand Down
53 changes: 32 additions & 21 deletions onnxruntime/core/providers/cuda/object_detection/roialign_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,8 @@ __global__ void RoIAlignForward(
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr) {
const int64_t* batch_indices_ptr,
const int64_t batch_size) {
for (size_t index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
Expand All @@ -106,6 +107,13 @@ __global__ void RoIAlignForward(
// RoI could have 4 or 5 columns
const T* offset_bottom_rois = bottom_rois + n * roi_cols;
const auto roi_batch_ind = batch_indices_ptr[n];
// Validate batch_indices values are within [0, batch_size).
// If the index is out of range, we set the output to 0 for this RoI element.
if (roi_batch_ind < 0 || roi_batch_ind >= batch_size) {
CUDA_KERNEL_ASSERT(false && "batch_indices values are out of range");
top_data[index] = 0;
continue;
}

// Do not use rounding; this implementation detail is critical
T roi_offset = half_pixel ? T(0.5) : T(0);
Expand Down Expand Up @@ -189,7 +197,8 @@ void RoiAlignImpl(
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr) {
const int64_t* batch_indices_ptr,
const int64_t batch_size) {
int blocksPerGrid = (int)(ceil(static_cast<float>(nthreads) / GridDim::maxThreadsPerBlock));
RoIAlignForward<T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, stream>>>(
nthreads,
Expand All @@ -206,27 +215,29 @@ void RoiAlignImpl(
top_data,
is_mode_avg,
half_pixel,
batch_indices_ptr);
batch_indices_ptr,
batch_size);
}

#define SPECIALIZED_IMPL(T) \
template void RoiAlignImpl<T>( \
cudaStream_t stream, \
const int64_t nthreads, \
const T* bottom_data, \
const T spatial_scale, \
const int64_t channels, \
const int64_t height, \
const int64_t width, \
const int64_t pooled_height, \
const int64_t pooled_width, \
const int64_t sampling_ratio, \
const T* bottom_rois, \
int64_t roi_cols, \
T* top_data, \
const bool is_mode_avg, \
const bool half_pixel, \
const int64_t* batch_indices_ptr);
// Explicit template instantiations of RoiAlignImpl for every supported
// element type T. These are required so the definitions compiled in this
// .cu translation unit can be linked against from the host-side kernel
// wrapper (roialign.cc), which only sees the declaration.
#define SPECIALIZED_IMPL(T) \
  template void RoiAlignImpl<T>( \
      cudaStream_t stream, \
      const int64_t nthreads, \
      const T* bottom_data, \
      const T spatial_scale, \
      const int64_t channels, \
      const int64_t height, \
      const int64_t width, \
      const int64_t pooled_height, \
      const int64_t pooled_width, \
      const int64_t sampling_ratio, \
      const T* bottom_rois, \
      int64_t roi_cols, \
      T* top_data, \
      const bool is_mode_avg, \
      const bool half_pixel, \
      const int64_t* batch_indices_ptr, \
      const int64_t batch_size);

SPECIALIZED_IMPL(float)
SPECIALIZED_IMPL(double)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@ void RoiAlignImpl(
T* top_data,
const bool is_mode_avg,
const bool half_pixel,
const int64_t* batch_indices_ptr);
const int64_t* batch_indices_ptr,
const int64_t batch_size); // batch size of the input tensor X

} // namespace cuda
} // namespace onnxruntime
52 changes: 52 additions & 0 deletions onnxruntime/test/providers/cpu/object_detection/roialign_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -854,5 +854,57 @@ TEST(RoiAlignTest, BatchIndicesNegative) {
execution_providers.push_back(DefaultCpuExecutionProvider());
test.Run(OpTester::ExpectResult::kExpectFailure, "batch_indices value -1 at index 0 is out of range [0, 1)", {}, nullptr, &execution_providers);
}

// Verifies the CUDA RoiAlign kernel handles a batch index >= batch_size
// without reading out of bounds: the offending RoI's output is zeroed.
// Skipped in Debug builds, where the device-side assert fires and leaves
// the CUDA context unusable for subsequent tests.
TEST(RoiAlignTest, BatchIndicesOutOfRange_CUDA) {
#if !defined(NDEBUG)
  GTEST_SKIP() << "Skipping in Debug builds because CUDA device-side asserts poison the CUDA context.";
#else
  auto provider = DefaultCudaExecutionProvider();
  if (!provider) {
    GTEST_SKIP() << "Skipping because there is no CUDA execution provider available.";
  }

  OpTester tester("RoiAlign", 10);
  tester.AddAttribute<float>("spatial_scale", 1.0f);
  tester.AddAttribute<int64_t>("output_height", 2);
  tester.AddAttribute<int64_t>("output_width", 2);
  tester.AddAttribute<int64_t>("sampling_ratio", 2);

  tester.AddInput<float>("X", {1, 1, 4, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
  tester.AddInput<float>("rois", {1, 4}, {0, 0, 3, 3});
  // batch_size is 1, so an index of 1 is out of range; expect zeroed output.
  tester.AddInput<int64_t>("batch_indices", {1}, {1});
  tester.AddOutput<float>("Y", {1, 1, 2, 2}, {0.f, 0.f, 0.f, 0.f});

  std::vector<std::unique_ptr<IExecutionProvider>> eps;
  eps.push_back(std::move(provider));
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps);
#endif
}

// Verifies the CUDA RoiAlign kernel handles a negative batch index without
// reading out of bounds: the offending RoI's output is zeroed.
// Skipped in Debug builds, where the device-side assert fires and leaves
// the CUDA context unusable for subsequent tests.
TEST(RoiAlignTest, BatchIndicesNegative_CUDA) {
#if !defined(NDEBUG)
  GTEST_SKIP() << "Skipping in Debug builds because CUDA device-side asserts poison the CUDA context.";
#else
  auto provider = DefaultCudaExecutionProvider();
  if (!provider) {
    GTEST_SKIP() << "Skipping because there is no CUDA execution provider available.";
  }

  OpTester tester("RoiAlign", 10);
  tester.AddAttribute<float>("spatial_scale", 1.0f);
  tester.AddAttribute<int64_t>("output_height", 2);
  tester.AddAttribute<int64_t>("output_width", 2);
  tester.AddAttribute<int64_t>("sampling_ratio", 2);

  tester.AddInput<float>("X", {1, 1, 4, 4}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
  tester.AddInput<float>("rois", {1, 4}, {0, 0, 3, 3});
  // A negative index can never be a valid batch; expect zeroed output.
  tester.AddInput<int64_t>("batch_indices", {1}, {-1});
  tester.AddOutput<float>("Y", {1, 1, 2, 2}, {0.f, 0.f, 0.f, 0.f});

  std::vector<std::unique_ptr<IExecutionProvider>> eps;
  eps.push_back(std::move(provider));
  tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps);
#endif
}
} // namespace test
} // namespace onnxruntime
Loading