Add nms op and batched_nms api (#40962)

* add nms op and batched_nms api
PaddlePaddle · Apr 5, 2022 · 7554f42 · 7554f42
1 parent 510347f
commit 7554f42
Show file tree

Hide file tree

Showing 9 changed files with 740 additions and 0 deletions.
diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt
@@ -66,6 +66,7 @@ detection_library(yolo_box_op SRCS yolo_box_op.cc)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu)
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu)
 detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc)
+detection_library(nms_op SRCS nms_op.cc nms_op.cu)
 
 if(WITH_GPU OR WITH_ROCM)
   set(TMPDEPS memory)

diff --git a/paddle/fluid/operators/detection/nms_op.cc b/paddle/fluid/operators/detection/nms_op.cc
@@ -0,0 +1,147 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/detection/nms_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class NMSOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Boxes",
+             "(Tensor) "
+             "Boxes is a Tensor with shape [N, 4] "
+             "N is the number of boxes "
+             "in last dimension in format [x1, x2, y1, y2] "
+             "the relation should be ``0 <= x1 < x2 && 0 <= y1 < y2``.");
+
+    AddOutput("KeepBoxesIdxs",
+              "(Tensor) "
+              "KeepBoxesIdxs is a Tensor with shape [N] ");
+    AddAttr<float>(
+        "iou_threshold",
+        "iou_threshold is a threshold value used to compress similar boxes "
+        "boxes with IoU > iou_threshold will be considered as overlapping "
+        "and just one of them can be kept.")
+        .SetDefault(1.0f)
+        .AddCustomChecker([](const float& iou_threshold) {
+          PADDLE_ENFORCE_LE(iou_threshold, 1.0f,
+                            platform::errors::InvalidArgument(
+                                "iou_threshold should less equal than 1.0 "
+                                "but got %f",
+                                iou_threshold));
+          PADDLE_ENFORCE_GE(iou_threshold, 0.0f,
+                            platform::errors::InvalidArgument(
+                                "iou_threshold should greater equal than 0.0 "
+                                "but got %f",
+                                iou_threshold));
+        });
+    AddComment(R"DOC(
+                NMS Operator.
+                This Operator is used to perform Non-Maximum Compress for input boxes.
+                Indices of boxes kept by NMS will be sorted by scores and output.
+            )DOC");
+  }
+};
+
+class NMSOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInput("Boxes"), "Input", "Boxes", "NMS");
+    OP_INOUT_CHECK(ctx->HasOutput("KeepBoxesIdxs"), "Output", "KeepBoxesIdxs",
+                   "NMS");
+
+    auto boxes_dim = ctx->GetInputDim("Boxes");
+    PADDLE_ENFORCE_EQ(boxes_dim.size(), 2,
+                      platform::errors::InvalidArgument(
+                          "The Input Boxes must be 2-dimention "
+                          "whose shape must be [N, 4] "
+                          "N is the number of boxes "
+                          "in last dimension in format [x1, x2, y1, y2]. "));
+    auto num_boxes = boxes_dim[0];
+
+    ctx->SetOutputDim("KeepBoxesIdxs", {num_boxes});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "Boxes"), ctx.GetPlace());
+  }
+};
+
+template <typename T>
+static void NMS(const T* boxes_data, int64_t* output_data, float threshold,
+                int64_t num_boxes) {
+  auto num_masks = CeilDivide(num_boxes, 64);
+  std::vector<uint64_t> masks(num_masks, 0);
+
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    if (masks[i / 64] & 1ULL << (i % 64)) continue;
+    T box_1[4];
+    for (int k = 0; k < 4; ++k) {
+      box_1[k] = boxes_data[i * 4 + k];
+    }
+    for (int64_t j = i + 1; j < num_boxes; ++j) {
+      if (masks[j / 64] & 1ULL << (j % 64)) continue;
+      T box_2[4];
+      for (int k = 0; k < 4; ++k) {
+        box_2[k] = boxes_data[j * 4 + k];
+      }
+      bool is_overlap = CalculateIoU<T>(box_1, box_2, threshold);
+      if (is_overlap) {
+        masks[j / 64] |= 1ULL << (j % 64);
+      }
+    }
+  }
+
+  int64_t output_data_idx = 0;
+  for (int64_t i = 0; i < num_boxes; ++i) {
+    if (masks[i / 64] & 1ULL << (i % 64)) continue;
+    output_data[output_data_idx++] = i;
+  }
+
+  for (; output_data_idx < num_boxes; ++output_data_idx) {
+    output_data[output_data_idx] = 0;
+  }
+}
+
+template <typename T>
+class NMSKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* boxes = context.Input<Tensor>("Boxes");
+    Tensor* output = context.Output<Tensor>("KeepBoxesIdxs");
+    int64_t* output_data = output->mutable_data<int64_t>(context.GetPlace());
+    auto threshold = context.template Attr<float>("iou_threshold");
+    NMS<T>(boxes->data<T>(), output_data, threshold, boxes->dims()[0]);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(
+    nms, ops::NMSOp, ops::NMSOpMaker,
+    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
+    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(nms, ops::NMSKernel<float>, ops::NMSKernel<double>);
diff --git a/paddle/fluid/operators/detection/nms_op.cu b/paddle/fluid/operators/detection/nms_op.cu
@@ -0,0 +1,108 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/operators/detection/nms_op.h"
+#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+
+static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+static __global__ void NMS(const T* boxes_data, float threshold,
+                           int64_t num_boxes, uint64_t* masks) {
+  auto raw_start = blockIdx.y;
+  auto col_start = blockIdx.x;
+  if (raw_start > col_start) return;
+
+  const int raw_last_storage =
+      min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock);
+  const int col_last_storage =
+      min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
+
+  if (threadIdx.x < raw_last_storage) {
+    uint64_t mask = 0;
+    auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x;
+    const T* current_box = boxes_data + current_box_idx * 4;
+    for (int i = 0; i < col_last_storage; ++i) {
+      const T* target_box = boxes_data + (col_start * threadsPerBlock + i) * 4;
+      if (CalculateIoU<T>(current_box, target_box, threshold)) {
+        mask |= 1ULL << i;
+      }
+    }
+    const int blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
+    masks[current_box_idx * blocks_per_line + col_start] = mask;
+  }
+}
+
+template <typename T>
+class NMSCudaKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* boxes = context.Input<Tensor>("Boxes");
+    Tensor* output = context.Output<Tensor>("KeepBoxesIdxs");
+    auto* output_data = output->mutable_data<int64_t>(context.GetPlace());
+
+    auto threshold = context.template Attr<float>("iou_threshold");
+    const int64_t num_boxes = boxes->dims()[0];
+    const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock);
+
+    dim3 block(threadsPerBlock);
+    dim3 grid(blocks_per_line, blocks_per_line);
+
+    auto mask_data =
+        memory::Alloc(context.cuda_device_context(),
+                      num_boxes * blocks_per_line * sizeof(uint64_t));
+    uint64_t* mask_dev = reinterpret_cast<uint64_t*>(mask_data->ptr());
+    NMS<T><<<grid, block, 0, context.cuda_device_context().stream()>>>(
+        boxes->data<T>(), threshold, num_boxes, mask_dev);
+
+    std::vector<uint64_t> mask_host(num_boxes * blocks_per_line);
+    memory::Copy(platform::CPUPlace(), mask_host.data(), context.GetPlace(),
+                 mask_dev, num_boxes * blocks_per_line * sizeof(uint64_t),
+                 context.cuda_device_context().stream());
+
+    std::vector<int64_t> remv(blocks_per_line);
+
+    std::vector<int64_t> keep_boxes_idxs(num_boxes);
+    int64_t* output_host = keep_boxes_idxs.data();
+
+    int64_t last_box_num = 0;
+    for (int64_t i = 0; i < num_boxes; ++i) {
+      auto remv_element_id = i / threadsPerBlock;
+      auto remv_bit_id = i % threadsPerBlock;
+      if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
+        output_host[last_box_num++] = i;
+        uint64_t* current_mask = mask_host.data() + i * blocks_per_line;
+        for (auto j = remv_element_id; j < blocks_per_line; ++j) {
+          remv[j] |= current_mask[j];
+        }
+      }
+    }
+    memory::Copy(context.GetPlace(), output_data, platform::CPUPlace(),
+                 output_host, sizeof(int64_t) * num_boxes,
+                 context.cuda_device_context().stream());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(nms, ops::NMSCudaKernel<float>,
+                        ops::NMSCudaKernel<double>);
diff --git a/paddle/fluid/operators/detection/nms_op.h b/paddle/fluid/operators/detection/nms_op.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+HOSTDEVICE static inline int64_t CeilDivide(int64_t n, int64_t m) {
+  return (n + m - 1) / m;
+}
+
+template <typename T>
+HOSTDEVICE inline bool CalculateIoU(const T* const box_1, const T* const box_2,
+                                    const float threshold) {
+  auto box_1_x0 = box_1[0], box_1_y0 = box_1[1];
+  auto box_1_x1 = box_1[2], box_1_y1 = box_1[3];
+  auto box_2_x0 = box_2[0], box_2_y0 = box_2[1];
+  auto box_2_x1 = box_2[2], box_2_y1 = box_2[3];
+
+  auto inter_box_x0 = box_1_x0 > box_2_x0 ? box_1_x0 : box_2_x0;
+  auto inter_box_y0 = box_1_y0 > box_2_y0 ? box_1_y0 : box_2_y0;
+  auto inter_box_x1 = box_1_x1 < box_2_x1 ? box_1_x1 : box_2_x1;
+  auto inter_box_y1 = box_1_y1 < box_2_y1 ? box_1_y1 : box_2_y1;
+
+  auto inter_width =
+      inter_box_x1 - inter_box_x0 > 0 ? inter_box_x1 - inter_box_x0 : 0;
+  auto inter_height =
+      inter_box_y1 - inter_box_y0 > 0 ? inter_box_y1 - inter_box_y0 : 0;
+  auto inter_area = inter_width * inter_height;
+  auto union_area = (box_1_x1 - box_1_x0) * (box_1_y1 - box_1_y0) +
+                    (box_2_x1 - box_2_x0) * (box_2_y1 - box_2_y0) - inter_area;
+  return inter_area / union_area > threshold;
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -234,6 +234,7 @@ endif()
 
 if(WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_complex_matmul)
+    LIST(REMOVE_ITEM TEST_OPS test_ops_nms)
 endif()
 
 LIST(REMOVE_ITEM TEST_OPS test_fleet_checkpoint)