[GPU] Add NMS_Gather ops
kelvinchoi-intel committed Jul 8, 2024
1 parent 44e4e5d commit 3a43b65
Showing 12 changed files with 359 additions and 2 deletions.
@@ -156,4 +156,29 @@ struct non_max_suppression : public primitive_base<non_max_suppression> {
ib >> make_data(&rotation, sizeof(rotation));
}
};

struct non_max_suppression_gather : primitive_base<non_max_suppression_gather> {
CLDNN_DECLARE_PRIMITIVE(non_max_suppression_gather)

/// @brief Constructs non_max_suppression_gather primitive.
/// @param id This primitive id.
/// @param inputs Input primitive ids.
non_max_suppression_gather(const primitive_id& id,
const std::vector<input_info>& inputs,
const size_t num_outputs = 1)
: primitive_base(id, inputs, {padding()}, {optional_data_type()}, num_outputs) {}

size_t hash() const override {
size_t seed = primitive::hash();
return seed;
}

bool operator==(const primitive& rhs) const override {
if (!compare_common_params(rhs)) {
return false;
}

return true;
}
};
} // namespace cldnn
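For illustration only (not part of the commit): a hypothetical instantiation of the new primitive, mirroring how the plugin wires it later in this commit. The "nms" primitive id and the interpretation of the three NMS outputs (selected indices, selected scores, valid-output count) are assumptions.

// Hypothetical usage sketch: consume the three outputs of an existing "nms" primitive.
auto nms_gather = cldnn::non_max_suppression_gather(
    "nms_gather",                      // primitive id
    { cldnn::input_info("nms", 0),     // selected indices
      cldnn::input_info("nms", 1),     // selected scores
      cldnn::input_info("nms", 2) },   // number of valid outputs
    3);                                // num_outputs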
@@ -4,6 +4,7 @@

#include "pass_manager.h"
#include "gather_inst.h"
#include "non_max_suppression_inst.h"
#include "permute_inst.h"
#include "strided_slice_inst.h"
#include "kv_cache_inst.h"
@@ -42,6 +43,11 @@ void mark_runtime_skippable_nodes::run(program& p) {
GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
}
});
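// non_max_suppression_gather only re-binds its NMS inputs at runtime (see non_max_suppression_gather_inst::update_output_memory), so it can always be skipped.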
program_helpers::do_for_types<non_max_suppression_gather>(*node, [](non_max_suppression_gather_node& node){
node.can_be_optimized(true);
node.set_runtime_skippable(true);
GPU_DEBUG_TRACE_DETAIL << "[mark_runtime_skippable_nodes] : " << node.id() << " can_be_optimized" << std::endl;
});
program_helpers::do_for_types<permute>(*node, [](permute_node& node){
// if node is already optimized at compilation time, do not handle at runtime
if (node.can_be_optimized())
52 changes: 52 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp
@@ -440,6 +440,58 @@ attach_non_max_suppression_impl::attach_non_max_suppression_impl() {
}

} // namespace detail

struct non_max_suppression_gather_impl : typed_primitive_impl<non_max_suppression_gather> {
using parent = typed_primitive_impl<non_max_suppression_gather>;

DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::non_max_suppression_gather_impl)

std::unique_ptr<primitive_impl> clone() const override {
return make_unique<non_max_suppression_gather_impl>(*this);
}

non_max_suppression_gather_impl() : parent("non_max_suppression_gather_impl") {}

event::ptr execute_impl(const std::vector<event::ptr>& events, typed_primitive_inst<non_max_suppression_gather>& instance) override {
auto& stream = instance.get_network().get_stream();

const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph();
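// With an out-of-order queue, a node inside a shape_of subgraph only forwards its dependencies' events instead of waiting on them.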

if (!pass_through_events) {
for (auto e : events) {
e->wait();
}
}

if (pass_through_events) {
if (events.size() > 1) {
return stream.group_events(events);
} else if (events.size() == 1) {
return events[0];
}
}

return stream.create_user_event(true);
}

static std::unique_ptr<primitive_impl> create(const non_max_suppression_gather_node&, const kernel_impl_params&) {
return make_unique<non_max_suppression_gather_impl>();
}
void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
};

namespace detail {

attach_non_max_suppression_gather_impl::attach_non_max_suppression_gather_impl() {
implementation_map<non_max_suppression_gather>::add(impl_types::cpu, non_max_suppression_gather_impl::create, {
std::make_tuple(data_types::i32, format::bfyx),
std::make_tuple(data_types::f16, format::bfyx),
std::make_tuple(data_types::f32, format::bfyx),
});
}

} // namespace detail

} // namespace cpu
} // namespace cldnn

1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp
@@ -16,6 +16,7 @@ void register_implementations() {
REGISTER_CPU(proposal);
REGISTER_CPU(read_value);
REGISTER_CPU(non_max_suppression);
REGISTER_CPU(non_max_suppression_gather);
REGISTER_CPU(shape_of);
REGISTER_CPU(concatenation);
REGISTER_CPU(gather);
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp
@@ -40,6 +40,7 @@ REGISTER_CPU(assign);
REGISTER_CPU(proposal);
REGISTER_CPU(read_value);
REGISTER_CPU(non_max_suppression);
REGISTER_CPU(non_max_suppression_gather);
REGISTER_CPU(detection_output);
REGISTER_CPU(shape_of);
REGISTER_CPU(concatenation);
34 changes: 34 additions & 0 deletions src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h
@@ -186,4 +186,38 @@ class typed_primitive_inst<non_max_suppression> : public typed_primitive_inst_ba

using non_max_suppression_inst = typed_primitive_inst<non_max_suppression>;

template <>
struct typed_program_node<non_max_suppression_gather> : typed_program_node_base<non_max_suppression_gather> {
using parent = typed_program_node_base<non_max_suppression_gather>;
using parent::parent;

bool generates_dynamic_output() const override {
return true;
}

std::vector<size_t> get_shape_infer_dependencies() const override { return {0, 1, 2}; }
};

using non_max_suppression_gather_node = typed_program_node<non_max_suppression_gather>;

template <>
class typed_primitive_inst<non_max_suppression_gather> : public typed_primitive_inst_base<non_max_suppression_gather> {
public:
using parent = typed_primitive_inst_base<non_max_suppression_gather>;
using parent::parent;

static layout calc_output_layout(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param);
template <typename ShapeType>
static std::vector<layout> calc_output_layouts(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param);
static std::string to_string(const non_max_suppression_gather_node& node);

typed_primitive_inst(network& network, non_max_suppression_gather_node const& node);
void update_output_memory() override;

private:
void on_execute() override;
};

using non_max_suppression_gather_inst = typed_primitive_inst<non_max_suppression_gather>;

} // namespace cldnn
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -1568,6 +1568,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
}
}
}
} else if (node.is_type<non_max_suppression_gather>()) {
return impl_types::cpu;
} else if (node.is_type<reorder>()) {
if (!_optimization_attributes.use_onednn_impls)
return impl_types::ocl;
92 changes: 92 additions & 0 deletions src/plugins/intel_gpu/src/graph/non_max_suppression.cpp
@@ -11,6 +11,10 @@
#include "nms_shape_inference.hpp"

namespace cldnn {

// -----------------------------------------------
// non_max_suppression
// -----------------------------------------------
GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression)

layout non_max_suppression_inst::calc_output_layout(non_max_suppression_node const& node, kernel_impl_params const& impl_param) {
@@ -81,4 +85,92 @@ std::string non_max_suppression_inst::to_string(non_max_suppression_node const&
return description.str();
}

// -----------------------------------------------
// non_max_suppression_gather
// -----------------------------------------------
GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression_gather)

layout non_max_suppression_gather_inst::calc_output_layout(non_max_suppression_gather_node const& node, kernel_impl_params const& impl_param) {
OPENVINO_THROW("Only calc_output_layouts should be used!");
}

template<typename ShapeType>
std::vector<layout> non_max_suppression_gather_inst::calc_output_layouts(non_max_suppression_gather_node const& /*node*/,
const kernel_impl_params& impl_param) {
std::vector<layout> layouts;

auto desc = impl_param.typed_desc<non_max_suppression_gather>();
std::vector<ShapeType> output_shapes = { ShapeType{}, ShapeType{}, ShapeType{} };

auto& memory_deps = impl_param.memory_deps;
if (memory_deps.count(0)) {
auto actual_output = memory_deps.at(0);
cldnn::mem_lock<int32_t, mem_lock_type::read> actual_output_lock(actual_output, impl_param.get_stream());

auto output_ps = actual_output->get_layout().get_partial_shape();
auto b = output_ps[0].get_length();
auto f = output_ps[1].get_length();

// find valid data size
auto output_data = actual_output_lock.data();
int64_t actual_valid_num = b;
for (int64_t i = 0; i < b ; i += 1) {
if (output_data[i * f] == -1) {
actual_valid_num = i;
break;
}
}

output_shapes[0] = output_shapes[1] = ShapeType{actual_valid_num, f};
output_shapes[2] = ShapeType{1};
} else {
output_shapes[0] = output_shapes[1] = ShapeType{ov::Dimension::dynamic(), 3};
output_shapes[2] = ShapeType{1};
}

for (size_t i = 0; i < desc->num_outputs; ++i) {
layouts.push_back({output_shapes[i],
impl_param.get_input_layout(i).data_type,
format::get_default_format(output_shapes[i].size())});
}
return layouts;
}

template std::vector<layout> non_max_suppression_gather_inst::calc_output_layouts<ov::PartialShape>(non_max_suppression_gather_node const& node,
const kernel_impl_params& impl_param);

std::string non_max_suppression_gather_inst::to_string(non_max_suppression_gather_node const& node) {
auto desc = node.get_primitive();
auto node_info = node.desc_to_json();

json_composite info;

node_info->add("non max suppression gather info", info);

std::stringstream description;
node_info->dump(description);
return description.str();
}

void non_max_suppression_gather_inst::on_execute() {
update_output_memory();
}

void non_max_suppression_gather_inst::update_output_memory() {
if (!can_be_optimized())
return;
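// When optimized out, each output is re-bound to the matching input buffer via reinterpret_buffer, so no data is copied.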

for (size_t i = 0; i < inputs_memory_count(); i++) {
if (node->get_program().is_new_shape_infer() && input_memory_ptr(i) == nullptr)
return;

if (output_memory_ptr(i) != nullptr && _network.get_engine().is_the_same_buffer(output_memory(i), input_memory(i)))
return;

_outputs[i] = {_network.get_engine().reinterpret_buffer(input_memory(i), _impl_params->get_output_layout(i))};
}
}

non_max_suppression_gather_inst::typed_primitive_inst(network& network, non_max_suppression_gather_node const& node) : parent(network, node) {}

} // namespace cldnn
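For illustration only (not part of the commit): the shape-inference loop above derives the number of valid detections by scanning the first NMS output for the first row whose leading element is -1, the value NMS uses to pad unused output slots. A minimal standalone sketch of that scan, using hypothetical names:

#include <cstdint>
#include <vector>

// Counts the leading valid rows of a row-major [num_rows x row_width] indices
// buffer, where unused rows are padded with -1 in their first column.
int64_t count_valid_nms_rows(const std::vector<int32_t>& selected_indices,
                             int64_t num_rows, int64_t row_width) {
    int64_t valid = num_rows;
    for (int64_t i = 0; i < num_rows; ++i) {
        if (selected_indices[i * row_width] == -1) {
            valid = i;
            break;
        }
    }
    return valid;
}

// Example: rows {0,0,2}, {0,0,5}, {-1,-1,-1}, {-1,-1,-1} give 2 valid rows,
// so calc_output_layouts would report an output shape of {2, 3}.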
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/graph/program.cpp
@@ -1496,6 +1496,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::broadcast::type_id() &&
prim.type() != cldnn::ctc_loss::type_id() &&
prim.type() != cldnn::non_max_suppression::type_id() &&
prim.type() != cldnn::non_max_suppression_gather::type_id() &&
prim.type() != cldnn::roi_align::type_id() &&
prim.type() != cldnn::matrix_nms::type_id() &&
prim.type() != cldnn::adaptive_pooling::type_id() &&
@@ -1548,6 +1549,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
prim.type() != cldnn::quantize::type_id() &&
prim.type() != cldnn::ctc_loss::type_id() &&
prim.type() != cldnn::non_max_suppression::type_id() &&
prim.type() != cldnn::non_max_suppression_gather::type_id() &&
prim.type() != cldnn::roi_align::type_id() &&
prim.type() != cldnn::matrix_nms::type_id() &&
prim.type() != cldnn::adaptive_pooling::type_id() &&
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/kernel_selector/common_types.h
@@ -77,6 +77,7 @@ enum class KernelType {
EXTRACT_IMAGE_PATCHES,
LOOP,
NON_MAX_SUPPRESSION,
NON_MAX_SUPPRESSION_GATHER,
DETECTION_OUTPUT,
EXPERIMENTAL_DETECTRON_DETECTION_OUTPUT,
EXPERIMENTAL_DETECTRON_GENERATE_PROPOSALS_SINGLE_IMAGE,
22 changes: 20 additions & 2 deletions src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp
@@ -54,9 +54,9 @@ static void CreateNonMaxSuppressionIEInternalOp(ProgramBuilder& p, const std::sh
auto boxesShape = op->get_input_partial_shape(0);
size_t num_outputs = op->get_output_size();
if (p.use_new_shape_infer()) {
auto nonMaxSuppressionLayerName = layer_type_name_ID(op);
auto NMSLayerName = layer_type_name_ID(op);
auto prim = cldnn::non_max_suppression(
nonMaxSuppressionLayerName,
NMSLayerName,
reordered_inputs[0],
reordered_inputs[1],
0,
@@ -78,6 +78,24 @@
}

p.add_primitive(*op, prim);

auto NMSGatherLayerName = layer_type_name_ID(op) + "_NMSGather";
std::vector<cldnn::input_info> nms_gather_inputs;
const std::vector<cldnn::input_info> nms_gather_input_list = {
cldnn::input_info(NMSLayerName, 0),
cldnn::input_info(NMSLayerName, 1),
cldnn::input_info(NMSLayerName, 2)
};
for (size_t i = 0; i < num_outputs; i++) {
nms_gather_inputs.push_back(nms_gather_input_list[i]);
}

auto nms_gather_prim = cldnn::non_max_suppression_gather(
NMSGatherLayerName,
nms_gather_inputs,
num_outputs);

p.add_primitive(*op, nms_gather_prim);
} else {
auto outputIndices = op->get_output_partial_shape(0)[0].get_length();
