From 3a43b650b2b2f97874333df850133396a517a5f3 Mon Sep 17 00:00:00 2001
From: Kelvin Choi
Date: Thu, 23 May 2024 22:28:23 +0900
Subject: [PATCH] [GPU] Add NMS_Gather ops

---
 .../primitives/non_max_suppression.hpp        |  25 ++++
 .../mark_runtime_skippable_nodes.cpp          |   6 +
 .../graph/impls/cpu/non_max_suppression.cpp   |  52 ++++++++
 .../src/graph/impls/cpu/register.cpp          |   1 +
 .../src/graph/impls/cpu/register.hpp          |   1 +
 .../graph/include/non_max_suppression_inst.h  |  34 +++++
 .../intel_gpu/src/graph/layout_optimizer.cpp  |   2 +
 .../src/graph/non_max_suppression.cpp         |  92 +++++++++++++
 src/plugins/intel_gpu/src/graph/program.cpp   |   2 +
 .../src/kernel_selector/common_types.h        |   1 +
 .../src/plugin/ops/non_max_suppression.cpp    |  22 +++-
 .../test_cases/non_max_suppression_test.cpp   | 123 ++++++++++++++++++
 12 files changed, 359 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp
index e7a7a7287d5e05..d5464d6f1d244d 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp
@@ -156,4 +156,29 @@ struct non_max_suppression : public primitive_base<non_max_suppression> {
         ib >> make_data(&rotation, sizeof(rotation));
     }
 };
+
+struct non_max_suppression_gather : primitive_base<non_max_suppression_gather> {
+    CLDNN_DECLARE_PRIMITIVE(non_max_suppression_gather)
+
+    /// @brief Constructs non_max_suppression_gather primitive.
+    /// @param id This primitive id.
+    /// @param inputs Input primitives ids.
+    non_max_suppression_gather(const primitive_id& id,
+                               const std::vector<input_info>& inputs,
+                               const size_t num_outputs = 1)
+        : primitive_base(id, inputs, {padding()}, {optional_data_type()}, num_outputs) {}
+
+    size_t hash() const override {
+        size_t seed = primitive::hash();
+        return seed;
+    }
+
+    bool operator==(const primitive& rhs) const override {
+        if (!compare_common_params(rhs)) {
+            return false;
+        }
+
+        return true;
+    }
+};
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
index e432248ac46669..f7614767449d15 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp
@@ -4,6 +4,7 @@
 
 #include "pass_manager.h"
 #include "gather_inst.h"
+#include "non_max_suppression_inst.h"
 #include "permute_inst.h"
 #include "strided_slice_inst.h"
 #include "kv_cache_inst.h"
@@ -42,6 +43,11 @@ void mark_runtime_skippable_nodes::run(program& p) {
                 GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
             }
         });
+        program_helpers::do_for_types<non_max_suppression_gather_node>(*node, [](non_max_suppression_gather_node& node){
+            node.can_be_optimized(true);
+            node.set_runtime_skippable(true);
+            GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
+        });
         program_helpers::do_for_types<permute_node>(*node, [](permute_node& node){
             // if node is already optimized at compilation time, do not handle at runtime
             if (node.can_be_optimized())
diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp
index f4793699de4120..f38efcd5c0d30c 100644
--- a/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp
@@ -440,6 +440,58 @@ attach_non_max_suppression_impl::attach_non_max_suppression_impl() {
     }
 }

 }  // namespace detail
+
+struct non_max_suppression_gather_impl : typed_primitive_impl<non_max_suppression_gather> {
+    using parent = typed_primitive_impl<non_max_suppression_gather>;
+
+    DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::non_max_suppression_gather_impl)
+
+    std::unique_ptr<primitive_impl> clone() const override {
+        return make_unique<non_max_suppression_gather_impl>(*this);
+    }
+
+    non_max_suppression_gather_impl() : parent("non_max_suppression_gather_impl") {}
+
+    event::ptr execute_impl(const std::vector<event::ptr>& events, typed_primitive_inst<non_max_suppression_gather>& instance) override {
+        auto& stream = instance.get_network().get_stream();
+
+        const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph();
+
+        if (!pass_through_events) {
+            for (auto e : events) {
+                e->wait();
+            }
+        }
+
+        if (pass_through_events) {
+            if (events.size() > 1) {
+                return stream.group_events(events);
+            } else if (events.size() == 1) {
+                return events[0];
+            }
+        }
+
+        return stream.create_user_event(true);
+    }
+
+    static std::unique_ptr<primitive_impl> create(const non_max_suppression_gather_node&, const kernel_impl_params&) {
+        return make_unique<non_max_suppression_gather_impl>();
+    }
+    void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
+};
+
+namespace detail {
+
+attach_non_max_suppression_gather_impl::attach_non_max_suppression_gather_impl() {
+    implementation_map<non_max_suppression_gather>::add(impl_types::cpu, non_max_suppression_gather_impl::create, {
+        std::make_tuple(data_types::i32, format::bfyx),
+        std::make_tuple(data_types::f16, format::bfyx),
+        std::make_tuple(data_types::f32, format::bfyx),
+    });
+}
+
+}  // namespace detail
+
 }  // namespace cpu
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp
index c70b39cc9de7f1..2b0dc5b212158c 100644
--- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp
@@ -16,6 +16,7 @@ void register_implementations() {
     REGISTER_CPU(proposal);
     REGISTER_CPU(read_value);
     REGISTER_CPU(non_max_suppression);
+    REGISTER_CPU(non_max_suppression_gather);
     REGISTER_CPU(shape_of);
     REGISTER_CPU(concatenation);
     REGISTER_CPU(gather);
diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp
index aaa56678d08ca1..cb89eae29d8c56 100644
--- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp
@@ -40,6 +40,7 @@ REGISTER_CPU(assign);
 REGISTER_CPU(proposal);
 REGISTER_CPU(read_value);
 REGISTER_CPU(non_max_suppression);
+REGISTER_CPU(non_max_suppression_gather);
 REGISTER_CPU(detection_output);
 REGISTER_CPU(shape_of);
 REGISTER_CPU(concatenation);
diff --git a/src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h b/src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h
index 250708f97cf858..de720e53d022ed 100644
--- a/src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h
@@ -186,4 +186,38 @@ class typed_primitive_inst<non_max_suppression> : public typed_primitive_inst_ba
 
 using non_max_suppression_inst = typed_primitive_inst<non_max_suppression>;
 
+template <>
+struct typed_program_node<non_max_suppression_gather> : typed_program_node_base<non_max_suppression_gather> {
+    using parent = typed_program_node_base<non_max_suppression_gather>;
+    using parent::parent;
+
+    bool generates_dynamic_output() const override {
+        return true;
+    }
+
+    std::vector<size_t> get_shape_infer_dependencies() const override { return {0, 1, 2}; }
+};
+
+using non_max_suppression_gather_node = typed_program_node<non_max_suppression_gather>;
+
+template <>
+class typed_primitive_inst<non_max_suppression_gather> : public typed_primitive_inst_base<non_max_suppression_gather> {
+public:
+    using parent = typed_primitive_inst_base<non_max_suppression_gather>;
+    using parent::parent;
+
+    static layout calc_output_layout(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param);
+    template <typename ShapeType>
+    static std::vector<layout> calc_output_layouts(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param);
+    static std::string to_string(const non_max_suppression_gather_node& node);
+
+    typed_primitive_inst(network& network, non_max_suppression_gather_node const& node);
+    void update_output_memory() override;
+
+private:
+    void on_execute() override;
+};
+
+using non_max_suppression_gather_inst = typed_primitive_inst<non_max_suppression_gather>;
+
 }  // namespace cldnn
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index 07c66b3b983c54..651445b3241b3b 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1568,6 +1568,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
                 }
             }
         }
+    } else if (node.is_type<non_max_suppression_gather>()) {
+        return impl_types::cpu;
     } else if (node.is_type<reorder>()) {
         if (!_optimization_attributes.use_onednn_impls)
             return impl_types::ocl;
diff --git a/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp b/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp
index 29a707ea53d3d9..a21cd75549f8b0 100644
--- a/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp
+++ b/src/plugins/intel_gpu/src/graph/non_max_suppression.cpp
@@ -11,6 +11,10 @@
 #include "nms_shape_inference.hpp"
 
 namespace cldnn {
+
+// -----------------------------------------------
+// non_max_suppression
+// -----------------------------------------------
 GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression)
 
 layout non_max_suppression_inst::calc_output_layout(non_max_suppression_node const& node, kernel_impl_params const& impl_param) {
@@ -81,4 +85,92 @@ std::string non_max_suppression_inst::to_string(non_max_suppression_node const&
     return description.str();
 }
+
+// -----------------------------------------------
+// non_max_suppression_gather
+// -----------------------------------------------
+GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression_gather)
+
+layout non_max_suppression_gather_inst::calc_output_layout(non_max_suppression_gather_node const& node, kernel_impl_params const& impl_param) {
+    OPENVINO_THROW("Only calc_output_layouts should be used!");
+}
+
+template<typename ShapeType>
+std::vector<layout> non_max_suppression_gather_inst::calc_output_layouts(non_max_suppression_gather_node const& /*node*/,
+                                                                         const kernel_impl_params& impl_param) {
+    std::vector<layout> layouts;
+
+    auto desc = impl_param.typed_desc<non_max_suppression_gather>();
+    std::vector<ShapeType> output_shapes = { ShapeType{}, ShapeType{}, ShapeType{} };
+
+    auto& memory_deps = impl_param.memory_deps;
+    if (memory_deps.count(0)) {
+        auto actual_output = memory_deps.at(0);
+        cldnn::mem_lock<int32_t, mem_lock_type::read> actual_output_lock(actual_output, impl_param.get_stream());
+
+        auto output_ps = actual_output->get_layout().get_partial_shape();
+        auto b = output_ps[0].get_length();
+        auto f = output_ps[1].get_length();
+
+        // find valid data size
+        auto output_data = actual_output_lock.data();
+        int64_t actual_valid_num = b;
+        for (int64_t i = 0; i < b ; i += 1) {
+            if (output_data[i * f] == -1) {
+                actual_valid_num = i;
+                break;
+            }
+        }
+
+        output_shapes[0] = output_shapes[1] = ShapeType{actual_valid_num, f};
+        output_shapes[2] = ShapeType{1};
+    } else {
+        output_shapes[0] = output_shapes[1] = ShapeType{ov::Dimension::dynamic(), 3};
+        output_shapes[2] = ShapeType{1};
+    }
+
+    for (size_t i = 0; i < desc->num_outputs; ++i) {
+        layouts.push_back({output_shapes[i],
+                           impl_param.get_input_layout(i).data_type,
+                           format::get_default_format(output_shapes[i].size())});
+    }
+    return layouts;
+}
+
+template std::vector<layout> non_max_suppression_gather_inst::calc_output_layouts<ov::PartialShape>(non_max_suppression_gather_node const& node,
+                                                                                                    const kernel_impl_params& impl_param);
+
+std::string non_max_suppression_gather_inst::to_string(non_max_suppression_gather_node const& node) {
+    auto desc = node.get_primitive();
+    auto node_info = node.desc_to_json();
+
+    json_composite info;
+
+    node_info->add("non max suppression gather info", info);
+
+    std::stringstream description;
+    node_info->dump(description);
+    return description.str();
+}
+
+void non_max_suppression_gather_inst::on_execute() {
+    update_output_memory();
+}
+
+void non_max_suppression_gather_inst::update_output_memory() {
+    if (!can_be_optimized())
+        return;
+
+    for (size_t i = 0; i < inputs_memory_count(); i++) {
+        if (node->get_program().is_new_shape_infer() && input_memory_ptr(i) == nullptr)
+            return;
+
+        if (output_memory_ptr(i) != nullptr && _network.get_engine().is_the_same_buffer(output_memory(i), input_memory(i)))
+            return;
+
+        _outputs[i] = {_network.get_engine().reinterpret_buffer(input_memory(i), _impl_params->get_output_layout(i))};
+    }
+}
+
+non_max_suppression_gather_inst::typed_primitive_inst(network& network, non_max_suppression_gather_node const& node) : parent(network, node) {}
+
 }  // namespace cldnn
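The calc_output_layouts() added above derives the real output shape at runtime by scanning the statically sized NMS output for the first row whose index is -1, the padding value NMS writes into unused slots. The following is a standalone sketch of that scan in plain C++ with made-up data; it is not the cldnn memory/layout API, just the same loop in isolation:

    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Mirror of the "find valid data size" loop above: the NMS output is
    // allocated for the worst case [max_output_boxes, 3] and unused rows are
    // padded with -1, so the first -1 in column 0 marks the end of real data.
    static int64_t count_valid_rows(const std::vector<int32_t>& indices, int64_t rows, int64_t cols) {
        int64_t valid = rows;
        for (int64_t i = 0; i < rows; ++i) {
            if (indices[i * cols] == -1) {
                valid = i;
                break;
            }
        }
        return valid;
    }

    int main() {
        // Six slots were allocated, but only three boxes survived NMS.
        const std::vector<int32_t> selected = {
            0, 0, 2,
            0, 1, 0,
            1, 0, 2,
            -1, -1, -1,
            -1, -1, -1,
            -1, -1, -1,
        };
        std::cout << count_valid_rows(selected, 6, 3) << std::endl;  // prints 3
        return 0;
    }

With that count, the first two gather outputs are reported as [3, 3] instead of the worst-case [6, 3], while the third output keeps its [1] shape.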
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index 1c526ea38188d2..4dbd438304ba1d 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -1496,6 +1496,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::broadcast::type_id() &&
             prim.type() != cldnn::ctc_loss::type_id() &&
             prim.type() != cldnn::non_max_suppression::type_id() &&
+            prim.type() != cldnn::non_max_suppression_gather::type_id() &&
             prim.type() != cldnn::roi_align::type_id() &&
             prim.type() != cldnn::matrix_nms::type_id() &&
             prim.type() != cldnn::adaptive_pooling::type_id() &&
@@ -1548,6 +1549,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
             prim.type() != cldnn::quantize::type_id() &&
             prim.type() != cldnn::ctc_loss::type_id() &&
             prim.type() != cldnn::non_max_suppression::type_id() &&
+            prim.type() != cldnn::non_max_suppression_gather::type_id() &&
             prim.type() != cldnn::roi_align::type_id() &&
             prim.type() != cldnn::matrix_nms::type_id() &&
             prim.type() != cldnn::adaptive_pooling::type_id() &&
diff --git a/src/plugins/intel_gpu/src/kernel_selector/common_types.h b/src/plugins/intel_gpu/src/kernel_selector/common_types.h
index 408cfc2b5e7719..d0aba8554eccc7 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/common_types.h
+++ b/src/plugins/intel_gpu/src/kernel_selector/common_types.h
@@ -77,6 +77,7 @@ enum class KernelType {
    EXTRACT_IMAGE_PATCHES,
    LOOP,
    NON_MAX_SUPPRESSION,
+   NON_MAX_SUPPRESSION_GATHER,
    DETECTION_OUTPUT,
    EXPERIMENTAL_DETECTRON_DETECTION_OUTPUT,
    EXPERIMENTAL_DETECTRON_GENERATE_PROPOSALS_SINGLE_IMAGE,
diff --git a/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp b/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp
index a46f30c418f00a..38c59ba044d404 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp
@@ -54,9 +54,9 @@ static void CreateNonMaxSuppressionIEInternalOp(ProgramBuilder& p, const std::sh
     auto boxesShape = op->get_input_partial_shape(0);
     size_t num_outputs = op->get_output_size();
     if (p.use_new_shape_infer()) {
-        auto nonMaxSuppressionLayerName = layer_type_name_ID(op);
+        auto NMSLayerName = layer_type_name_ID(op);
         auto prim = cldnn::non_max_suppression(
-                nonMaxSuppressionLayerName,
+                NMSLayerName,
                 reordered_inputs[0],
                 reordered_inputs[1],
                 0,
@@ -78,6 +78,24 @@ static void CreateNonMaxSuppressionIEInternalOp(ProgramBuilder& p, const std::sh
         }
 
         p.add_primitive(*op, prim);
+
+        auto NMSGatherLayerName = layer_type_name_ID(op) + "_NMSGather";
+        std::vector<cldnn::input_info> nms_gather_inputs;
+        const std::vector<cldnn::input_info> nms_gather_input_list = {
+            cldnn::input_info(NMSLayerName, 0),
+            cldnn::input_info(NMSLayerName, 1),
+            cldnn::input_info(NMSLayerName, 2)
+        };
+        for (size_t i = 0; i < num_outputs; i++) {
+            nms_gather_inputs.push_back(nms_gather_input_list[i]);
+        }
+
+        auto nms_gather_prim = cldnn::non_max_suppression_gather(
+            NMSGatherLayerName,
+            nms_gather_inputs,
+            num_outputs);
+
+        p.add_primitive(*op, nms_gather_prim);
     } else {
         auto outputIndices = op->get_output_partial_shape(0)[0].get_length();
 
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/non_max_suppression_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/non_max_suppression_test.cpp
index adbb0c029c8bb4..3d7647ee2f53f1 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/non_max_suppression_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/non_max_suppression_test.cpp
@@ -572,6 +572,125 @@ struct non_max_suppression_basic : public testing::Test {
         }
     }
 
+    void test_nms_gather_score_threshold(bool is_caching_test) {
+        auto& engine = tests::get_test_engine();
+
+        auto num_per_class_mem = engine.allocate_memory(layout(data_types::f32, format::bfyx, tensor(batch(1))));
+        tests::set_values(num_per_class_mem, {3.f});
+        auto iou_threshold_mem = engine.allocate_memory(layout(data_types::f32, format::bfyx, tensor(batch(1))));
+        tests::set_values(iou_threshold_mem, {0.4f});
+        auto score_threshold_mem = engine.allocate_memory(layout(data_types::f32, format::bfyx, tensor(batch(1))));
+        tests::set_values(score_threshold_mem, {0.4f});
+
+        const auto l_boxes = this->boxes_layout;
+        const auto l_scores = this->scores_layout;
+
+        topology topo;
+        topo.add(input_layout("boxes", layout{ov::PartialShape{l_boxes.batch(), l_boxes.feature(), l_boxes.spatial(1)}, l_boxes.data_type, l_boxes.format}));
+        topo.add(input_layout("scores", layout{ov::PartialShape{l_scores.batch(), l_scores.feature(), l_scores.spatial(1)}, l_scores.data_type, l_scores.format}));
+        topo.add(data("num_per_class", num_per_class_mem));
+        topo.add(data("iou_threshold", iou_threshold_mem));
+        topo.add(data("score_threshold", score_threshold_mem));
+        topo.add(reorder("reformat_boxes", input_info("boxes"), this->layout_format, this->data_type));
+        topo.add(reorder("reformat_scores", input_info("scores"), this->layout_format, this->data_type));
+
+        auto nms = non_max_suppression("nms",
+                                       input_info("reformat_boxes"),
+                                       input_info("reformat_scores"),
+                                       this->batch_size * this->classes_num * this->boxes_num,
+                                       false,
+                                       true,
+                                       "num_per_class",
+                                       "iou_threshold",
+                                       "score_threshold",
+                                       "", "", "", 3);
+        auto output_data_type = this->data_type;
+        nms.output_data_types = {optional_data_type{}, optional_data_type{output_data_type}, optional_data_type{}};
+        nms.output_paddings = {padding(), padding(), padding()};
+
+        topo.add(nms);
+        topo.add(non_max_suppression_gather("nms_gather",
+                                            {input_info("nms", 0),
+                                             input_info("nms", 1),
+                                             input_info("nms", 2)},
+                                            3));
+        topo.add(reorder("plane_nms0", input_info("nms_gather", 0), format::bfyx, cldnn::data_types::i32));
+        topo.add(reorder("plane_nms1", input_info("nms_gather", 1), format::bfyx, this->data_type));
+        topo.add(reorder("plane_nms2", input_info("nms_gather", 2), format::bfyx, cldnn::data_types::i32));
+
+        ExecutionConfig config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
+        config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
+
+        cldnn::network::ptr net = get_network(engine, topo, config, get_test_stream_ptr(), is_caching_test);
+
+        auto boxes_mem = this->get_boxes_memory(engine);
+        auto scores_mem = this->get_scores_memory(engine);
+
+        net->set_input_data("boxes", boxes_mem);
+        net->set_input_data("scores", scores_mem);
+
+        auto result = net->execute();
+
+        // output 0
+        std::vector<int32_t> expected_out0 = {
+            0, 0, 2,
+            0, 1, 0,
+            1, 0, 2,
+            0, 0, 1,
+            1, 0, 1
+        };
+
+        auto out_mem0 = result.at("plane_nms0").get_memory();
+        cldnn::mem_lock<int32_t> out0_ptr(out_mem0, get_test_stream());
+
+        ASSERT_EQ(expected_out0.size(), out0_ptr.size());
+        for (size_t i = 0; i < out0_ptr.size(); ++i) {
+            ASSERT_EQ(expected_out0[i], out0_ptr[i]) << "at i = " << i;
+        }
+
+        // output 1
+        if (this->data_type == cldnn::data_types::f32) {
+            std::vector<float> expected_out1 = {
+                0.0f, 0.0f, 0.9f,
+                0.0f, 1.0f, 0.9f,
+                1.0f, 0.0f, 0.8f,
+                0.0f, 0.0f, 0.7f,
+                1.0f, 0.0f, 0.5f
+            };
+            auto out_mem1 = result.at("plane_nms1").get_memory();
+            cldnn::mem_lock<float> out1_ptr(out_mem1, get_test_stream());
+
+            ASSERT_EQ(expected_out1.size(), out1_ptr.size());
+            for (size_t i = 0; i < out1_ptr.size(); ++i) {
+                ASSERT_EQ(expected_out1[i], out1_ptr[i]) << "at i = " << i;
+            }
+        } else if (this->data_type == cldnn::data_types::f16) {
+            std::vector<float> expected_out1 = {
+                0.0f, 0.0f, 0.899902f,
+                0.0f, 1.0f, 0.899902f,
+                1.0f, 0.0f, 0.799805f,
+                0.0f, 0.0f, 0.700195f,
+                1.0f, 0.0f, 0.5f
+            };
+            auto out_mem1 = result.at("plane_nms1").get_memory();
+            cldnn::mem_lock<ov::float16> out1_ptr(out_mem1, get_test_stream());
+
+            ASSERT_EQ(expected_out1.size(), out1_ptr.size());
+            for (size_t i = 0; i < out1_ptr.size(); ++i) {
+                ASSERT_EQ(expected_out1[i], out1_ptr[i]) << "at i = " << i;
+            }
+        } else {
+            GTEST_FAIL() << "Not supported data type.";
+        }
+
+        // output 2
+        auto out_mem2 = result.at("plane_nms2").get_memory();
+        cldnn::mem_lock<int32_t> out2_ptr(out_mem2, get_test_stream());
+        ASSERT_EQ(1, out2_ptr.size());
+        ASSERT_EQ(5, out2_ptr[0]);
+    }
+
     void test_soft_nms_sigma(bool is_caching_test) {
         auto& engine = tests::get_test_engine();
 
@@ -678,6 +797,10 @@ TYPED_TEST(non_max_suppression_basic, score_threshold) {
     this->test_score_threshold(false);
 }
 
+TYPED_TEST(non_max_suppression_basic, nms_gather_score_threshold) {
+    this->test_nms_gather_score_threshold(false);
+}
+
 TYPED_TEST(non_max_suppression_basic, soft_nms_sigma) {
     this->test_soft_nms_sigma(false);
 }
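Because the gather node is marked runtime-skippable in mark_runtime_skippable_nodes and its CPU impl performs no computation, the trimming itself is zero-copy: update_output_memory() re-describes the NMS output buffer with the smaller layout through reinterpret_buffer(). The snippet below is a minimal standalone illustration of that idea in plain C++; the struct and variable names are hypothetical and pointers stand in for cldnn memory objects:

    #include <cassert>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // A "trimmed" output that aliases the producer's buffer instead of copying it,
    // analogous to reinterpret_buffer(input_memory(i), trimmed_layout) above.
    struct TrimmedOutput {
        const int32_t* data;  // same storage as the padded NMS output
        int64_t rows;         // number of valid rows, from the shape-inference scan
        int64_t cols;
    };

    int main() {
        // Worst-case buffer for four boxes; only the first two rows are valid.
        std::vector<int32_t> nms_out = {0, 0, 2,  0, 1, 0,  -1, -1, -1,  -1, -1, -1};
        TrimmedOutput trimmed{nms_out.data(), /*rows=*/2, /*cols=*/3};

        assert(trimmed.data == nms_out.data());  // no data was moved or copied
        for (int64_t r = 0; r < trimmed.rows; ++r) {
            std::cout << trimmed.data[r * trimmed.cols] << " "
                      << trimmed.data[r * trimmed.cols + 1] << " "
                      << trimmed.data[r * trimmed.cols + 2] << "\n";
        }
        return 0;
    }

Downstream primitives then read the gather node's outputs and see only the rows that NMS actually selected, which is what the unit test above verifies through the plane_nms0/1/2 reorders.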