Skip to content

Commit

Permalink
Reduce decomposition
Browse files Browse the repository at this point in the history
  • Loading branch information
v-Golubev committed Nov 29, 2023
1 parent ef4178e commit f06ddd0
Show file tree
Hide file tree
Showing 7 changed files with 288 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface ReduceMaxDecomposition
 * @brief Decomposes ReduceMax to a range of low-level operations on linear IR
 * @ingroup snippets
 */
class ReduceMaxDecomposition : public Pass {
public:
    OPENVINO_RTTI("ReduceMaxDecomposition", "Pass")
    explicit ReduceMaxDecomposition(size_t vector_size);
    bool run(LinearIR& linear_ir) override;

private:
    // Number of elements processed per vector iteration of the generated reduce loop.
    size_t m_vector_size;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

/**
 * @interface ReduceSumDecomposition
 * @brief Decomposes ReduceSum to a range of low-level operations on linear IR
 * @ingroup snippets
 */
class ReduceSumDecomposition : public Pass {
public:
    OPENVINO_RTTI("ReduceSumDecomposition", "Pass")
    explicit ReduceSumDecomposition(size_t vector_size);
    bool run(LinearIR& linear_ir) override;

private:
    // Number of elements processed per vector iteration of the generated reduce loop.
    size_t m_vector_size;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
2 changes: 1 addition & 1 deletion src/common/snippets/src/lowered/loop_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,7 @@ void LinearIR::LoopManager::insert_loop_id(const ExpressionPtr& expr, size_t new
OPENVINO_ASSERT(m_map.count(new_id) == 1, "Failed marking expression by Loop ID: the Loop with this ID hasn't registered");
auto& loop_ids = expr->m_loop_ids;
OPENVINO_ASSERT(std::find(loop_ids.cbegin(), loop_ids.cend(), new_id) == loop_ids.cend(),
"Expression cannot have several the same Loop IDs");
"Expression cannot have several identical Loop IDs");
auto insert_it = before ? loop_ids.cbegin() : loop_ids.cend();
if (target_id != SIZE_MAX) {
insert_it = std::find(loop_ids.cbegin(), loop_ids.cend(), target_id);
Expand Down
107 changes: 107 additions & 0 deletions src/common/snippets/src/lowered/pass/reduce_max_decomposition.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/reduce_max_decomposition.hpp"

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/iter_handler.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/itt.hpp"

#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/pass/pattern/matcher.hpp"


namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

using LoopInfo = LinearIR::LoopManager::LoopInfo;

// vector_size defines the loop increment (elements per vector iteration) used when marking the reduce loop.
ReduceMaxDecomposition::ReduceMaxDecomposition(size_t vector_size) : m_vector_size{vector_size} {}

// Replaces every ReduceMax expression in the linear IR with an explicit sequence:
//   VectorBuffer -> Fill(-FLT_MAX) -> [loop: Fill(tail padding) -> Maximum] -> HorizonMax,
// marks the inner reduce loop in the LoopManager, and erases the original ReduceMax.
// Returns true if at least one ReduceMax was decomposed.
bool ReduceMaxDecomposition::run(LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ReduceMaxDecompositionLowered")
    const auto& loop_manager = linear_ir.get_loop_manager();

    bool modified = false;
    for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
        const auto& op = (*expr_it)->get_node();
        // Only ReduceMax expressions are decomposed; all other expressions are left untouched.
        if (!ov::is_type<ov::snippets::op::ReduceMax>(op))
            continue;

        const auto reduce = op;
        const auto reduce_expr = *expr_it;
        const auto& input_shape = reduce_expr->get_input_port_descriptor(0)->get_shape();
        // Reduction is performed along the innermost dimension of the input.
        const auto work_amount = *(input_shape.rbegin());
        const bool is_dynamic = reduce->is_dynamic();

        // We need an iterator to the inserted element
        // Helper: insert the node right before the current ReduceMax position and
        // return both the new expression iterator and the node itself.
        auto push_node = [&](const std::shared_ptr<Node>& n) {
            const auto expr = linear_ir.insert(expr_it, n);
            if (is_dynamic)
                expr->get()->updateShapes();
            return std::make_pair(expr, n);
        };
        // Float constant values in byte representation
        // 0xff7fffff is the bit pattern of -FLT_MAX — the identity element for max-reduction.
        const auto fill_value = uint32_t(0xff7fffff);
        // Note: VectorBuffer is a special case, since it should go before the initial Load.
        // The buffer must be initialized with fill_value before reduction
        const auto vector_buffer = push_node(std::make_shared<op::VectorBuffer>());
        const auto initial_fill = push_node(std::make_shared<op::Fill>(vector_buffer.second, 0, fill_value));

        // Reduce loop
        // Fill pads the tail lanes (beyond m_vector_size-aligned data) with the identity value;
        // Maximum accumulates into the buffer initialized above.
        const auto fill = push_node(std::make_shared<op::Fill>(reduce->get_input_source_output(0), m_vector_size, fill_value));
        const auto max = push_node(std::make_shared<ov::op::v1::Maximum>(fill.second, initial_fill.second));

        // Mark the accumulation loop spanning [fill, current position): work_amount iterations
        // advanced by m_vector_size along dimension index 0 (innermost). Entry ports are the
        // Fill data input and the accumulator input of Maximum; the exit port is the Maximum output.
        const auto reduce_loop_id = loop_manager->mark_loop(
            fill.first,
            expr_it,
            work_amount,
            m_vector_size,
            0,
            std::vector<ExpressionPort>{(*fill.first)->get_input_port(0), (*max.first)->get_input_port(1)},
            std::vector<ExpressionPort>{(*max.first)->get_output_port(0)});
        const auto reduce_loop_info = loop_manager->get_loop_id(reduce_loop_id);
        const auto tail_size = work_amount % m_vector_size;
        if (tail_size != 0) {
            // Remainder iteration: register last-iteration handlers that process the tail
            // and shift the Fill offset to pad only the tail lanes.
            reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<DefaultTailLoopHandler>(tail_size);
            reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
            if (work_amount > m_vector_size) {
                // A vector main body exists as well: shrink its work amount by the tail
                // and zero its finalization offsets (the tail iteration finalizes instead).
                reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
                reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
            }
        }

        // HorizonMax collapses the vector accumulator into the final scalar maximum.
        const auto horizon = push_node(std::make_shared<op::HorizonMax>(max.second));

        // Transfer original ExpressionPorts
        // Reroute the ReduceMax data input into Fill, and redirect all consumers of the
        // ReduceMax output to the HorizonMax output.
        linear_ir.replace_input((*fill.first)->get_input_port(0), reduce_expr->get_input_port_connector(0));
        linear_ir.replace_input(reduce_expr->get_output_port_connector(0)->get_consumers(), (*horizon.first)->get_output_port_connector(0));

        // Update Loop info for outer loops
        // In every enclosing loop, the single ReduceMax expression is replaced by the whole
        // inserted range [vector_buffer, current position) with the new entry/exit points.
        const std::vector<ExpressionPort> entry_points{(*fill.first)->get_input_port(0)};
        const std::vector<ExpressionPort> exit_points{(*horizon.first)->get_output_port(0)};
        for (auto loop_id : reduce_expr->get_loop_ids()) {
            loop_manager->expression_replacement(vector_buffer.first,
                                                 expr_it,
                                                 reduce_expr,
                                                 loop_id,
                                                 entry_points,
                                                 exit_points);
        }

        // Remove the now fully-detached ReduceMax; continue scanning from the next expression.
        expr_it = linear_ir.erase(expr_it);
        modified = true;
    }
    return modified;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
108 changes: 108 additions & 0 deletions src/common/snippets/src/lowered/pass/reduce_sum_decomposition.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/reduce_sum_decomposition.hpp"

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/iter_handler.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/itt.hpp"

#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/pass/pattern/matcher.hpp"


namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

using LoopInfo = LinearIR::LoopManager::LoopInfo;

// vector_size defines the loop increment (elements per vector iteration) used when marking the reduce loop.
ReduceSumDecomposition::ReduceSumDecomposition(size_t vector_size) : m_vector_size{vector_size} {}

// Replaces every ReduceSum expression in the linear IR with an explicit sequence:
//   VectorBuffer -> Fill(0.0f) -> [loop: Fill(tail padding) -> Add] -> HorizonSum,
// marks the inner reduce loop in the LoopManager, and erases the original ReduceSum.
// Returns true if at least one ReduceSum was decomposed.
bool ReduceSumDecomposition::run(LinearIR& linear_ir) {
    OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::ReduceSumDecompositionLowered")
    bool modified = false;
    const auto& loop_manager = linear_ir.get_loop_manager();

    for (auto expr_it = linear_ir.begin(); expr_it != linear_ir.end(); expr_it++) {
        const auto& op = (*expr_it)->get_node();
        // Only ReduceSum expressions are decomposed; all other expressions are left untouched.
        if (!ov::is_type<ov::snippets::op::ReduceSum>(op))
            continue;

        const auto reduce = op;
        const auto reduce_expr = *expr_it;
        const auto& input_shape = reduce_expr->get_input_port_descriptor(0)->get_shape();
        // Reduction is performed along the innermost dimension of the input.
        const auto work_amount = *(input_shape.rbegin());
        const bool is_dynamic = reduce->is_dynamic();

        // We need an iterator to the inserted element
        // Helper: insert the node right before the current ReduceSum position and
        // return both the new expression iterator and the node itself.
        auto push_node = [&](const std::shared_ptr<Node>& n) {
            const auto expr = linear_ir.insert(expr_it, n);
            if (is_dynamic)
                expr->get()->updateShapes();
            return std::make_pair(expr, n);
        };
        // Float constant values in byte representation
        // 0x00000000 is the bit pattern of 0.0f — the identity element for sum-reduction.
        const auto fill_value = uint32_t(0x00000000);
        // Note: VectorBuffer is a special case, since it should go before the initial Load.
        // The buffer must be initialized with fill_value before reduction
        const auto vector_buffer = push_node(std::make_shared<op::VectorBuffer>());
        const auto initial_fill = push_node(std::make_shared<op::Fill>(vector_buffer.second, 0, fill_value));

        // Reduce loop
        // Fill pads the tail lanes (beyond m_vector_size-aligned data) with the identity value;
        // Add accumulates into the buffer initialized above.
        const auto fill = push_node(std::make_shared<op::Fill>(reduce->get_input_source_output(0), m_vector_size, fill_value));
        const auto add = push_node(std::make_shared<ov::op::v1::Add>(fill.second, initial_fill.second));

        // Mark the accumulation loop spanning [fill, current position): work_amount iterations
        // advanced by m_vector_size along dimension index 0 (innermost). Entry ports are the
        // Fill data input and the accumulator input of Add; the exit port is the Add output.
        const auto reduce_loop_id = loop_manager->mark_loop(
            fill.first,
            expr_it,
            work_amount,
            m_vector_size,
            0,
            std::vector<ExpressionPort>{(*fill.first)->get_input_port(0), (*add.first)->get_input_port(1)},
            std::vector<ExpressionPort>{(*add.first)->get_output_port(0)});
        const auto reduce_loop_info = loop_manager->get_loop_info(reduce_loop_id);
        const auto tail_size = work_amount % m_vector_size;
        if (tail_size != 0) {
            // Remainder iteration: register last-iteration handlers that process the tail
            // and shift the Fill offset to pad only the tail lanes.
            reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<DefaultTailLoopHandler>(tail_size);
            reduce_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
            if (work_amount > m_vector_size) {
                // A vector main body exists as well: shrink its work amount by the tail
                // and zero its finalization offsets (the tail iteration finalizes instead).
                reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
                reduce_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
            }
        }

        // HorizonSum collapses the vector accumulator into the final scalar sum.
        const auto horizon = push_node(std::make_shared<op::HorizonSum>(add.second));

        // Transfer original ExpressionPorts
        // Reroute the ReduceSum data input into Fill, and redirect all consumers of the
        // ReduceSum output to the HorizonSum output.
        linear_ir.replace_input((*fill.first)->get_input_port(0), reduce_expr->get_input_port_connector(0));
        linear_ir.replace_input(reduce_expr->get_output_port_connector(0)->get_consumers(), (*horizon.first)->get_output_port_connector(0));

        // Update Loop info for outer loops
        // In every enclosing loop, the single ReduceSum expression is replaced by the whole
        // inserted range [vector_buffer, current position) with the new entry/exit points.
        const std::vector<ExpressionPort> entry_points{(*fill.first)->get_input_port(0)};
        const std::vector<ExpressionPort> exit_points{(*horizon.first)->get_output_port(0)};
        for (auto loop_id : reduce_expr->get_loop_ids()) {
            loop_manager->expression_replacement(vector_buffer.first,
                                                 expr_it,
                                                 reduce_expr,
                                                 loop_id,
                                                 entry_points,
                                                 exit_points);
        }

        // Remove the now fully-detached ReduceSum; continue scanning from the next expression.
        expr_it = linear_ir.erase(expr_it);
        modified = true;
    }

    return modified;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
6 changes: 5 additions & 1 deletion src/common/snippets/src/op/subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
#include "snippets/lowered/pass/validate_loops.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/optimize_domain.hpp"
#include "snippets/lowered/pass/reduce_max_decomposition.hpp"
#include "snippets/lowered/pass/reduce_sum_decomposition.hpp"

#include "transformations/utils/utils.hpp"

Expand Down Expand Up @@ -436,7 +438,9 @@ void Subgraph::control_flow_transformations(lowered::LinearIR& linear_ir,

PassPipeline pipeline;
pipeline.register_pass<lowered::pass::MarkLoops>(vector_size);
pipeline.register_pass<lowered::pass::SoftmaxDecomposition>(vector_size);
// pipeline.register_pass<lowered::pass::SoftmaxDecomposition>(vector_size);
pipeline.register_pass<lowered::pass::ReduceMaxDecomposition>(vector_size);
pipeline.register_pass<lowered::pass::ReduceSumDecomposition>(vector_size);
pipeline.register_pass<lowered::pass::FuseLoops>();
pipeline.register_pass<lowered::pass::SplitLoops>();
pipeline.register_pass<lowered::pass::MoveResultOutOfLoop>();
Expand Down
4 changes: 3 additions & 1 deletion src/common/snippets/src/pass/softmax_decomposition.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,11 @@ SoftmaxDecomposition::SoftmaxDecomposition() {
subtensor[i] = PortDescriptor::ServiceDimensions::FULL_DIM;

PortDescriptorUtils::set_port_descriptor_ptr(reduce_max->input(0), std::make_shared<PortDescriptor>(reduce_max->input(0), subtensor));
PortDescriptorUtils::set_port_descriptor_ptr(reduce_sum->input(0), std::make_shared<PortDescriptor>(reduce_sum->input(0), subtensor));
PortDescriptorUtils::set_port_descriptor_ptr(reduce_max->output(0), std::make_shared<PortDescriptor>(reduce_max->output(0), subtensor));
PortDescriptorUtils::set_port_descriptor_ptr(reduce_sum->input(0), std::make_shared<PortDescriptor>(reduce_sum->input(0), subtensor));
PortDescriptorUtils::set_port_descriptor_ptr(reduce_sum->output(0), std::make_shared<PortDescriptor>(reduce_sum->output(0), subtensor));
PortDescriptorUtils::set_port_descriptor_ptr(power->input(0), std::make_shared<PortDescriptor>(power->input(0), subtensor));
PortDescriptorUtils::set_port_descriptor_ptr(power->output(0), std::make_shared<PortDescriptor>(power->output(0), subtensor));

return ov::replace_node_update_name(softmax, multiply);
};
Expand Down

0 comments on commit f06ddd0

Please sign in to comment.