Subtensor propagation reused in loop specific iterations handler
v-Golubev committed Dec 6, 2023
1 parent ac308cf commit 3a4c7fb
Showing 13 changed files with 269 additions and 170 deletions.
21 changes: 17 additions & 4 deletions src/common/snippets/include/snippets/lowered/loop_manager.hpp
@@ -107,16 +107,22 @@ class LinearIR::LoopManager {
size_t mark_loop(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
size_t work_amount,
size_t work_amount_increment,
size_t increment,
size_t dim_idx,
const std::vector<T>& entries,
const std::vector<T>& exits) {
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, work_amount_increment, entries, exits);
const std::vector<T>& exits,
bool set_default_handlers = true) {
if (increment > work_amount)
increment = work_amount;
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, increment, entries, exits);
loop_info->set_dim_idx(dim_idx);
const auto loop_id = this->add_loop_info(loop_info);
for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
insert_loop_id(*expr_it, loop_id);
}
if (set_default_handlers) {
set_default_loop_handlers(loop_info);
}
return loop_id;
}

@@ -126,12 +132,18 @@ class LinearIR::LoopManager {
size_t work_amount,
size_t increment,
const std::vector<T>& entries,
const std::vector<T>& exits) {
const std::vector<T>& exits,
bool set_default_handlers = true) {
if (increment > work_amount)
increment = work_amount;
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, increment, entries, exits);
const auto loop_id = this->add_loop_info(loop_info);
for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
insert_loop_id(*expr_it, loop_id);
}
if (set_default_handlers) {
set_default_loop_handlers(loop_info);
}
return loop_id;
}
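
A minimal usage sketch of the new set_default_handlers flag (not part of the commit): a caller that wants non-default tail handling can pass false to either overload and attach its own pipelines afterwards. Only the API visible in this diff is used; the loop boundaries, ports and MyCustomTailPass are assumed/hypothetical.

// Illustrative sketch only; MyCustomTailPass is a hypothetical SubgraphPass.
const auto loop_id = loop_manager->mark_loop(loop_begin_pos, loop_end_pos,
                                             work_amount, increment,
                                             entries, exits,
                                             /*set_default_handlers=*/false);
const auto loop_info = loop_manager->get_loop_info(loop_id);
const auto tail_size = work_amount % increment;
if (tail_size != 0) {
    // Register a custom last-iteration pipeline instead of the defaults.
    loop_info->handlers[LoopInfo::LAST_ITER].register_pass<MyCustomTailPass>(tail_size);
}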

@@ -186,6 +198,7 @@ class LinearIR::LoopManager {
size_t loop_id, bool loop_ops_inserted = false);

LoopPort get_loop_port_by_expr_port(const ExpressionPort& expr_port, const size_t loop_id);
static void set_default_loop_handlers(const LoopInfoPtr& loop_info);

private:
static void get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
@@ -14,8 +14,6 @@ namespace snippets {
namespace lowered {
namespace pass {

void register_default_tail_handlers(lowered::pass::SubgraphPassPipeline& pipeline, size_t tail_size);

class SetSingleIterationWithWorkAmount : public pass::SubgraphPass {
public:
SetSingleIterationWithWorkAmount(size_t work_amount);
@@ -0,0 +1,28 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

class UpdateSubtensors : public pass::SubgraphPass {
public:
UpdateSubtensors(size_t tail_size);
OPENVINO_RTTI("UpdateSubtensors", "Pass")
bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;

private:
size_t m_tail_size;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
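
In this commit the pass is not run directly; it is registered into a loop's last-iteration handler pipeline together with the tail size, as the loop_manager.cpp hunk below shows. A hedged sketch, assuming a LoopInfoPtr named loop_info is at hand:

// Illustrative sketch, not part of the commit.
const auto tail_size = loop_info->get_work_amount() % loop_info->get_increment();
if (tail_size != 0) {
    // The pipeline later invokes UpdateSubtensors::run(linear_ir, begin, end) over the tail body.
    loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::UpdateSubtensors>(tail_size);
}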
21 changes: 13 additions & 8 deletions src/common/snippets/src/lowered/loop_manager.cpp
@@ -6,6 +6,7 @@

#include "snippets/lowered/expression.hpp"
#include "snippets/lowered/pass/iter_handler.hpp"
#include "snippets/lowered/pass/propagate_subtensors.hpp"
#include "snippets/utils.hpp"

#include "openvino/core/graph_util.hpp"
@@ -260,6 +261,17 @@ LinearIR::LoopManager::LoopPort LinearIR::LoopManager::get_loop_port_by_expr_por
: get_loop_port(loop_info->get_exit_points());
}

void LinearIR::LoopManager::set_default_loop_handlers(const LoopInfoPtr& loop_info) {
const auto tail_size = loop_info->get_work_amount() % loop_info->get_increment();
if (tail_size != 0) {
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::SetSingleIterationWithWorkAmount>(tail_size);
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::UpdateMemoryAccessOps>(tail_size);
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::UpdateSubtensors>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ReduceWorkAmount>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ZeroFinalizationOffsets>();
}
}
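// Illustrative note, not from the commit: with made-up numbers work_amount = 70 and increment = 32,
// tail_size = 70 % 32 = 6, so LAST_ITER receives SetSingleIterationWithWorkAmount(6),
// UpdateMemoryAccessOps(6) and UpdateSubtensors(6), while MAIN_BODY receives ReduceWorkAmount(6)
// and ZeroFinalizationOffsets(); when the work amount divides evenly by the increment,
// no handlers are registered.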

void LinearIR::LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
std::vector<ExpressionPort> &entries,
@@ -350,16 +362,9 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,

OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
const auto increment = subtensor_value <= work_amount ? subtensor_value : work_amount;
const auto increment = subtensor_value;
const auto id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
const auto loop_info = get_loop_info(id);

const auto tail_size = work_amount % increment;
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ReduceWorkAmount>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ZeroFinalizationOffsets>();
}
}
}

9 changes: 2 additions & 7 deletions src/common/snippets/src/lowered/pass/iter_handler.cpp
@@ -4,23 +4,18 @@

#include "snippets/lowered/pass/iter_handler.hpp"

#include "snippets/itt.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/propagate_subtensors.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/utils.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

void register_default_tail_handlers(lowered::pass::SubgraphPassPipeline& pipeline, size_t tail_size) {
pipeline.register_pass<SetSingleIterationWithWorkAmount>(tail_size);
pipeline.register_pass<UpdateMemoryAccessOps>(tail_size);
}

SetSingleIterationWithWorkAmount::SetSingleIterationWithWorkAmount(size_t work_amount)
: SubgraphPass(),
m_work_amount(work_amount) {}
148 changes: 148 additions & 0 deletions src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
@@ -0,0 +1,148 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/propagate_subtensors.hpp"

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/utils.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
namespace {
void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
const LinearIR::LoopManager::LoopInfoPtr& loop_info,
LinearIR::container::const_iterator begin,
LinearIR::container::const_iterator end,
const size_t new_dim_value) {
std::map<lowered::PortDescriptorPtr, snippets::VectorDims> original_shapes;
static constexpr size_t existing_subtensor_value = SIZE_MAX;
// First step: set new dim value to the corresponding entry_points' dimensions
if (new_dim_value != existing_subtensor_value) {
for (const auto& port : loop_info->get_entry_points()) {
if (port.is_incremented) {
const auto& expr = port.expr_port->get_expr();
const auto node = expr->get_node();
auto desc = port.expr_port->get_descriptor_ptr();
auto subtensor = desc->get_subtensor();
if (port.dim_idx < subtensor.size()) {
*(subtensor.rbegin() + port.dim_idx) = new_dim_value;
desc->set_subtensor(subtensor);
}

const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr();
const auto& layout = parent_desc->get_layout();
const auto& shape = parent_desc->get_shape();
if (original_shapes.find(parent_desc) == original_shapes.end()) {
original_shapes[parent_desc] = shape;
}
auto new_shape = shape;
new_shape[*(layout.rbegin() + port.dim_idx)] = new_dim_value;
parent_desc->set_shape(new_shape);
}
}
}

auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) {
if (port.is_incremented) {
auto desc = port.expr_port->get_descriptor_ptr();
const auto expr = port.expr_port->get_expr();
const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr();

const auto& layout = parent_desc->get_layout();
const auto& shape = parent_desc->get_shape();
const auto& desc_subtensor = desc->get_subtensor();
if (port.dim_idx < desc_subtensor.size()) {
if (original_shapes.find(parent_desc) == original_shapes.end()) {
original_shapes[parent_desc] = shape;
}
auto new_shape = shape;
new_shape[*(layout.rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx);
parent_desc->set_shape(new_shape);
}
}
};

auto update_subtensors = [](const std::vector<PortDescriptorPtr>& descs, bool is_input) {
for (const auto& desc : descs) {
const auto& subtensor = desc->get_subtensor();
if (!subtensor.empty()) {
auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout())
: snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout());
const size_t subtensor_start = planar_dims.size() - subtensor.size();
VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end());
for (size_t i = 0; i < new_subtensor.size(); ++i) {
new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]);
}
desc->set_subtensor(new_subtensor);
}
}
};

auto shape_inference_end_it = end;
const bool loop_by_last_dim = loop_info->get_dim_idx() == 0;
// Subtensors are updated using shape inference infrastructure:
// For inner loops, the propagation function is called recursively
for (auto expr_it = begin; expr_it != end; expr_it++) {
const auto expr = *expr_it;
if (ov::is_type<snippets::op::LoopEnd>(expr->get_node()))
continue;
if (auto loop_begin = ov::as_type_ptr<snippets::op::LoopBegin>(expr->get_node())) {
const auto loop_end = loop_begin->get_loop_end();
const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id());
const auto inner_begin = std::next(expr_it);
const auto inner_end = linear_ir.find(linear_ir.get_expr_by_node(loop_end));

// The corresponding shapes of inner loops entry points must be updated using existing subtensor values
if (new_dim_value == existing_subtensor_value) {
for (const auto& port : loop_info->get_entry_points())
update_only_dim_idx_with_subtensor_value(port);
}
propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end, existing_subtensor_value);
expr_it = inner_end;
continue;
}
if ((ov::is_type<snippets::op::BroadcastMove>(expr_it->get()->get_node()) ||
ov::is_type<snippets::op::BroadcastLoad>(expr_it->get()->get_node())) &&
loop_by_last_dim) {
// WA: we have to break subtensor propagation if we try to propagate a new last dim through Broadcast nodes,
// since they broadcast the last dim to its original dimension value anyway.
// This workaround might be avoided if blocked shapes were used for tail size propagation.
shape_inference_end_it = expr_it;
break;
}
expr->updateShapes();
update_subtensors(expr->get_input_port_descriptors(), true);
update_subtensors(expr->get_output_port_descriptors(), false);
}

// After subtensor propagation, the original shapes must be restored
for (const auto& elem : original_shapes)
elem.first->set_shape(elem.second);
for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++)
(*expr_it)->updateShapes();
}
} // namespace

UpdateSubtensors::UpdateSubtensors(size_t tail_size) : SubgraphPass(), m_tail_size(tail_size) {}

bool UpdateSubtensors::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
const auto& expr = *end;
const auto node = expr->get_node();
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
const auto& loop_manager = linear_ir.get_loop_manager();
const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
propagate_updated_subtensor_through_loop(linear_ir, loop_info, std::next(begin), end, m_tail_size);
return true;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
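
The clamping rule applied by the update_subtensors lambda above (elementwise minimum of the current planar dims and the stored subtensor over the innermost dimensions) can be shown with a small standalone snippet; this is an illustrative sketch in which plain std::vector stands in for VectorDims, not code from the commit.

// Standalone illustration of the subtensor clamping rule, not part of the commit.
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<std::size_t> clamp_subtensor(const std::vector<std::size_t>& planar_dims,
                                         const std::vector<std::size_t>& subtensor) {
    // The subtensor is aligned to the innermost dimensions of the planar shape.
    const std::size_t start = planar_dims.size() - subtensor.size();
    std::vector<std::size_t> result(planar_dims.begin() + start, planar_dims.end());
    for (std::size_t i = 0; i < result.size(); ++i)
        result[i] = std::min(result[i], subtensor[i]);
    return result;
}

// Example: planar_dims = {1, 16, 6}, subtensor = {16, 32} -> {16, 6},
// i.e. the tail size 6 replaces the stored inner value 32.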

3 changes: 0 additions & 3 deletions src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -102,10 +102,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
const auto loop_info = loop_manager->get_loop_info(reduce_loop_id);
const auto tail_size = work_amount % increment;
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
const auto horizon = push_node(get_horizon_node(accumulation.second, reduce_type_info));

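For readability, the handler registration that remains in ReduceDecomposition after this change, reconstructed from the kept lines above (illustrative; the actual file is authoritative):

// Reconstruction for illustration only.
const auto loop_info = loop_manager->get_loop_info(reduce_loop_id);
const auto tail_size = work_amount % increment;
if (tail_size != 0) {
    // Only the reduce-specific handler is added here; the default tail and main-body
    // handlers are now attached by mark_loop() itself.
    loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
}
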
25 changes: 3 additions & 22 deletions src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
@@ -75,12 +75,7 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
const auto& reduce_max_loop_info = loop_manager->get_loop_info(reduce_max_loop_id);
const auto tail_size = inner_work_amount % m_vector_size;
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(reduce_max_loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
reduce_max_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
if (inner_work_amount > m_vector_size) {
reduce_max_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
reduce_max_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
}
const auto broadcast_horizon_max = push_node(std::make_shared<op::BroadcastMove>(horizon_max.second, broadcasted_dim));
const auto vector_buffer_sum = push_node(std::make_shared<op::VectorBuffer>());
@@ -104,12 +99,7 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
(*sum.first)->get_output_port(0)});
const auto& reduce_sum_loop_info = loop_manager->get_loop_info(reduce_sum_loop_id);
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(reduce_sum_loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
reduce_sum_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
if (inner_work_amount > m_vector_size) {
reduce_sum_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
reduce_sum_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
}

// Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop
@@ -125,18 +115,9 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
linear_ir.replace_input(output_connector->get_consumers(), (*mul.first)->get_output_port_connector(0));

// Markup of Mul Loop
const auto mul_loop_id = loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0,
std::vector<ExpressionPort>{(*mul.first)->get_input_port(0),
(*mul.first)->get_input_port(1)},
std::vector<ExpressionPort>{(*mul.first)->get_output_port(0)});
const auto& mul_loop_info = loop_manager->get_loop_info(mul_loop_id);
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(mul_loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
if (inner_work_amount > m_vector_size) {
mul_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
mul_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
}
loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0,
std::vector<ExpressionPort>{(*mul.first)->get_input_port(0), (*mul.first)->get_input_port(1)},
std::vector<ExpressionPort>{(*mul.first)->get_output_port(0)});

// Update Loop info for outer loops
const auto entry_points = std::vector<ExpressionPort>{(*fill_max_tail.first)->get_input_port(0),
5 changes: 0 additions & 5 deletions src/common/snippets/src/lowered/pass/split_loops.cpp
@@ -92,12 +92,7 @@ bool SplitLoops::run(LinearIR& linear_ir) {
// Need to skip this transformation for such cases or improve the logic
if (tail_size != 0) {
// TODO: should we remove previous tail loop handler?
register_default_tail_handlers(new_loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
new_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<TransformInnerSplitLoop>(tail_size);
if (work_amount > increment) {
new_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
new_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
}
break;
}