Subtensor propagation reused in loop specific iterations handler
v-Golubev committed Dec 6, 2023
1 parent ac308cf commit 3a4c7fb
Showing 13 changed files with 269 additions and 170 deletions.
21 changes: 17 additions & 4 deletions src/common/snippets/include/snippets/lowered/loop_manager.hpp
@@ -107,16 +107,22 @@ class LinearIR::LoopManager {
size_t mark_loop(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
size_t work_amount,
size_t work_amount_increment,
size_t increment,
size_t dim_idx,
const std::vector<T>& entries,
const std::vector<T>& exits) {
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, work_amount_increment, entries, exits);
const std::vector<T>& exits,
bool set_default_handlers = true) {
if (increment > work_amount)
increment = work_amount;
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, increment, entries, exits);
loop_info->set_dim_idx(dim_idx);
const auto loop_id = this->add_loop_info(loop_info);
for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
insert_loop_id(*expr_it, loop_id);
}
if (set_default_handlers) {
set_default_loop_handlers(loop_info);
}
return loop_id;
}

@@ -126,12 +132,18 @@ class LinearIR::LoopManager {
size_t work_amount,
size_t increment,
const std::vector<T>& entries,
const std::vector<T>& exits) {
const std::vector<T>& exits,
bool set_default_handlers = true) {
if (increment > work_amount)
increment = work_amount;
const auto loop_info = std::make_shared<LoopManager::LoopInfo>(work_amount, increment, entries, exits);
const auto loop_id = this->add_loop_info(loop_info);
for (auto expr_it = loop_begin_pos; expr_it != loop_end_pos; ++expr_it) {
insert_loop_id(*expr_it, loop_id);
}
if (set_default_handlers) {
set_default_loop_handlers(loop_info);
}
return loop_id;
}
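
A minimal usage sketch of the new set_default_handlers flag (not part of the commit): a caller that wants non-default tail handling can pass false to either overload and attach its own pipelines afterwards. Only the API visible in this diff is used; the loop boundaries, ports and MyCustomTailPass are assumed/hypothetical.

// Illustrative sketch only; MyCustomTailPass is a hypothetical SubgraphPass.
const auto loop_id = loop_manager->mark_loop(loop_begin_pos, loop_end_pos,
                                             work_amount, increment,
                                             entries, exits,
                                             /*set_default_handlers=*/false);
const auto loop_info = loop_manager->get_loop_info(loop_id);
const auto tail_size = work_amount % increment;
if (tail_size != 0) {
    // Register a custom last-iteration pipeline instead of the defaults.
    loop_info->handlers[LoopInfo::LAST_ITER].register_pass<MyCustomTailPass>(tail_size);
}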

@@ -186,6 +198,7 @@ class LinearIR::LoopManager {
size_t loop_id, bool loop_ops_inserted = false);

LoopPort get_loop_port_by_expr_port(const ExpressionPort& expr_port, const size_t loop_id);
static void set_default_loop_handlers(const LoopInfoPtr& loop_info);

private:
static void get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
@@ -14,8 +14,6 @@ namespace snippets {
namespace lowered {
namespace pass {

void register_default_tail_handlers(lowered::pass::SubgraphPassPipeline& pipeline, size_t tail_size);

class SetSingleIterationWithWorkAmount : public pass::SubgraphPass {
public:
SetSingleIterationWithWorkAmount(size_t work_amount);
@@ -0,0 +1,28 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/pass/pass.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

class UpdateSubtensors : public pass::SubgraphPass {
public:
UpdateSubtensors(size_t tail_size);
OPENVINO_RTTI("UpdateSubtensors", "Pass")
bool run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) override;

private:
size_t m_tail_size;
};

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
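
In this commit the pass is not run directly; it is registered into a loop's last-iteration handler pipeline together with the tail size, as the loop_manager.cpp hunk below shows. A hedged sketch, assuming a LoopInfoPtr named loop_info is at hand:

// Illustrative sketch, not part of the commit.
const auto tail_size = loop_info->get_work_amount() % loop_info->get_increment();
if (tail_size != 0) {
    // The pipeline later invokes UpdateSubtensors::run(linear_ir, begin, end) over the tail body.
    loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::UpdateSubtensors>(tail_size);
}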
21 changes: 13 additions & 8 deletions src/common/snippets/src/lowered/loop_manager.cpp
@@ -6,6 +6,7 @@

#include "snippets/lowered/expression.hpp"
#include "snippets/lowered/pass/iter_handler.hpp"
#include "snippets/lowered/pass/propagate_subtensors.hpp"
#include "snippets/utils.hpp"

#include "openvino/core/graph_util.hpp"
@@ -260,6 +261,17 @@ LinearIR::LoopManager::LoopPort LinearIR::LoopManager::get_loop_port_by_expr_por
: get_loop_port(loop_info->get_exit_points());
}

void LinearIR::LoopManager::set_default_loop_handlers(const LoopInfoPtr& loop_info) {
const auto tail_size = loop_info->get_work_amount() % loop_info->get_increment();
if (tail_size != 0) {
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::SetSingleIterationWithWorkAmount>(tail_size);
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::UpdateMemoryAccessOps>(tail_size);
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<lowered::pass::UpdateSubtensors>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ReduceWorkAmount>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ZeroFinalizationOffsets>();
}
}
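// Illustrative note, not from the commit: with made-up numbers work_amount = 70 and increment = 32,
// tail_size = 70 % 32 = 6, so LAST_ITER receives SetSingleIterationWithWorkAmount(6),
// UpdateMemoryAccessOps(6) and UpdateSubtensors(6), while MAIN_BODY receives ReduceWorkAmount(6)
// and ZeroFinalizationOffsets(); when the work amount divides evenly by the increment,
// no handlers are registered.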

void LinearIR::LoopManager::get_io_loop_ports(LinearIR::constExprIt loop_begin_pos,
LinearIR::constExprIt loop_end_pos,
std::vector<ExpressionPort> &entries,
@@ -350,16 +362,9 @@ void LinearIR::LoopManager::mark_loop(LinearIR::constExprIt loop_begin_pos,

OPENVINO_ASSERT(dim_idx < loop_tensor.size(), "Incorrect indexes of Loop for markup");
const auto work_amount = *(loop_tensor.rbegin() + dim_idx);
const auto increment = subtensor_value <= work_amount ? subtensor_value : work_amount;
const auto increment = subtensor_value;
const auto id = mark_loop(loop_begin_pos, loop_end_pos, work_amount, increment, dim_idx, loop_entry_points, loop_exit_points);
const auto loop_info = get_loop_info(id);

const auto tail_size = work_amount % increment;
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ReduceWorkAmount>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<lowered::pass::ZeroFinalizationOffsets>();
}
}
}

9 changes: 2 additions & 7 deletions src/common/snippets/src/lowered/pass/iter_handler.cpp
@@ -4,23 +4,18 @@

#include "snippets/lowered/pass/iter_handler.hpp"

#include "snippets/itt.hpp"
#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/lowered/pass/init_loops.hpp"
#include "snippets/lowered/pass/propagate_subtensors.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/utils.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {

void register_default_tail_handlers(lowered::pass::SubgraphPassPipeline& pipeline, size_t tail_size) {
pipeline.register_pass<SetSingleIterationWithWorkAmount>(tail_size);
pipeline.register_pass<UpdateMemoryAccessOps>(tail_size);
}

SetSingleIterationWithWorkAmount::SetSingleIterationWithWorkAmount(size_t work_amount)
: SubgraphPass(),
m_work_amount(work_amount) {}
148 changes: 148 additions & 0 deletions src/common/snippets/src/lowered/pass/propagate_subtensors.cpp
@@ -0,0 +1,148 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "snippets/lowered/pass/propagate_subtensors.hpp"

#include "snippets/lowered/linear_ir.hpp"
#include "snippets/lowered/loop_manager.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/utils.hpp"
#include "snippets/itt.hpp"

namespace ov {
namespace snippets {
namespace lowered {
namespace pass {
namespace {
void propagate_updated_subtensor_through_loop(const LinearIR& linear_ir,
const LinearIR::LoopManager::LoopInfoPtr& loop_info,
LinearIR::container::const_iterator begin,
LinearIR::container::const_iterator end,
const size_t new_dim_value) {
std::map<lowered::PortDescriptorPtr, snippets::VectorDims> original_shapes;
static constexpr size_t existing_subtensor_value = SIZE_MAX;
// First step: set new dim value to the corresponding entry_points' dimensions
if (new_dim_value != existing_subtensor_value) {
for (const auto& port : loop_info->get_entry_points()) {
if (port.is_incremented) {
const auto& expr = port.expr_port->get_expr();
const auto node = expr->get_node();
auto desc = port.expr_port->get_descriptor_ptr();
auto subtensor = desc->get_subtensor();
if (port.dim_idx < subtensor.size()) {
*(subtensor.rbegin() + port.dim_idx) = new_dim_value;
desc->set_subtensor(subtensor);
}

const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr();
const auto& layout = parent_desc->get_layout();
const auto& shape = parent_desc->get_shape();
if (original_shapes.find(parent_desc) == original_shapes.end()) {
original_shapes[parent_desc] = shape;
}
auto new_shape = shape;
new_shape[*(layout.rbegin() + port.dim_idx)] = new_dim_value;
parent_desc->set_shape(new_shape);
}
}
}

auto update_only_dim_idx_with_subtensor_value = [&](const LinearIR::LoopManager::LoopPort& port) {
if (port.is_incremented) {
auto desc = port.expr_port->get_descriptor_ptr();
const auto expr = port.expr_port->get_expr();
const auto parent_desc = expr->get_input_port_connector(port.expr_port->get_index())->get_source().get_descriptor_ptr();

const auto& layout = parent_desc->get_layout();
const auto& shape = parent_desc->get_shape();
const auto& desc_subtensor = desc->get_subtensor();
if (port.dim_idx < desc_subtensor.size()) {
if (original_shapes.find(parent_desc) == original_shapes.end()) {
original_shapes[parent_desc] = shape;
}
auto new_shape = shape;
new_shape[*(layout.rbegin() + port.dim_idx)] = *(desc_subtensor.rbegin() + port.dim_idx);
parent_desc->set_shape(new_shape);
}
}
};

auto update_subtensors = [](const std::vector<PortDescriptorPtr>& descs, bool is_input) {
for (const auto& desc : descs) {
const auto& subtensor = desc->get_subtensor();
if (!subtensor.empty()) {
auto planar_dims = is_input ? snippets::utils::get_planar_vdims(desc->get_shape(), desc->get_layout())
: snippets::utils::get_preordered_vdims(desc->get_shape(), desc->get_layout());
const size_t subtensor_start = planar_dims.size() - subtensor.size();
VectorDims new_subtensor(planar_dims.begin() + subtensor_start, planar_dims.end());
for (size_t i = 0; i < new_subtensor.size(); ++i) {
new_subtensor[i] = std::min(new_subtensor[i], subtensor[i]);
}
desc->set_subtensor(new_subtensor);
}
}
};

auto shape_inference_end_it = end;
const bool loop_by_last_dim = loop_info->get_dim_idx() == 0;
// Subtensors are updated using shape inference infrastructure:
// For inner loops, the propagation function is called recursively
for (auto expr_it = begin; expr_it != end; expr_it++) {
const auto expr = *expr_it;
if (ov::is_type<snippets::op::LoopEnd>(expr->get_node()))
continue;
if (auto loop_begin = ov::as_type_ptr<snippets::op::LoopBegin>(expr->get_node())) {
const auto loop_end = loop_begin->get_loop_end();
const auto inner_loop_info = linear_ir.get_loop_manager()->get_loop_info(loop_end->get_id());
const auto inner_begin = std::next(expr_it);
const auto inner_end = linear_ir.find(linear_ir.get_expr_by_node(loop_end));

// The corresponding shapes of inner loops entry points must be updated using existing subtensor values
if (new_dim_value == existing_subtensor_value) {
for (const auto& port : loop_info->get_entry_points())
update_only_dim_idx_with_subtensor_value(port);
}
propagate_updated_subtensor_through_loop(linear_ir, inner_loop_info, inner_begin, inner_end, existing_subtensor_value);
expr_it = inner_end;
continue;
}
if ((ov::is_type<snippets::op::BroadcastMove>(expr_it->get()->get_node()) ||
ov::is_type<snippets::op::BroadcastLoad>(expr_it->get()->get_node())) &&
loop_by_last_dim) {
// WA: we have to break subtensor propagation if we try to propagate a new last dim through Broadcast nodes,
// since they broadcast the last dim to its original dimension value anyway.
// This workaround might be avoided if blocked shapes were used for tail size propagation.
shape_inference_end_it = expr_it;
break;
}
expr->updateShapes();
update_subtensors(expr->get_input_port_descriptors(), true);
update_subtensors(expr->get_output_port_descriptors(), false);
}

// After subtensor propagation, the original shapes must be restored
for (const auto& elem : original_shapes)
elem.first->set_shape(elem.second);
for (auto expr_it = begin; expr_it != shape_inference_end_it; expr_it++)
(*expr_it)->updateShapes();
}
} // namespace

UpdateSubtensors::UpdateSubtensors(size_t tail_size) : SubgraphPass(), m_tail_size(tail_size) {}

bool UpdateSubtensors::run(const LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
const auto& expr = *end;
const auto node = expr->get_node();
const auto loop_end = ov::as_type_ptr<op::LoopEnd>(node);
const auto& loop_manager = linear_ir.get_loop_manager();
const auto& loop_info = loop_manager->get_loop_info(loop_end->get_id());
propagate_updated_subtensor_through_loop(linear_ir, loop_info, std::next(begin), end, m_tail_size);
return true;
}

} // namespace pass
} // namespace lowered
} // namespace snippets
} // namespace ov
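
The clamping rule applied by the update_subtensors lambda above (elementwise minimum of the current planar dims and the stored subtensor over the innermost dimensions) can be shown with a small standalone snippet; this is an illustrative sketch in which plain std::vector stands in for VectorDims, not code from the commit.

// Standalone illustration of the subtensor clamping rule, not part of the commit.
#include <algorithm>
#include <cstddef>
#include <vector>

std::vector<std::size_t> clamp_subtensor(const std::vector<std::size_t>& planar_dims,
                                         const std::vector<std::size_t>& subtensor) {
    // The subtensor is aligned to the innermost dimensions of the planar shape.
    const std::size_t start = planar_dims.size() - subtensor.size();
    std::vector<std::size_t> result(planar_dims.begin() + start, planar_dims.end());
    for (std::size_t i = 0; i < result.size(); ++i)
        result[i] = std::min(result[i], subtensor[i]);
    return result;
}

// Example: planar_dims = {1, 16, 6}, subtensor = {16, 32} -> {16, 6},
// i.e. the tail size 6 replaces the stored inner value 32.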

3 changes: 0 additions & 3 deletions src/common/snippets/src/lowered/pass/reduce_decomposition.cpp
@@ -102,10 +102,7 @@ bool ReduceDecomposition::run(LinearIR& linear_ir) {
const auto loop_info = loop_manager->get_loop_info(reduce_loop_id);
const auto tail_size = work_amount % increment;
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
const auto horizon = push_node(get_horizon_node(accumulation.second, reduce_type_info));

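For readability, the handler registration that remains in ReduceDecomposition after this change, reconstructed from the kept lines above (illustrative; the actual file is authoritative):

// Reconstruction for illustration only.
const auto loop_info = loop_manager->get_loop_info(reduce_loop_id);
const auto tail_size = work_amount % increment;
if (tail_size != 0) {
    // Only the reduce-specific handler is added here; the default tail and main-body
    // handlers are now attached by mark_loop() itself.
    loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
}
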
25 changes: 3 additions & 22 deletions src/common/snippets/src/lowered/pass/softmax_decomposition.cpp
@@ -75,12 +75,7 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
const auto& reduce_max_loop_info = loop_manager->get_loop_info(reduce_max_loop_id);
const auto tail_size = inner_work_amount % m_vector_size;
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(reduce_max_loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
reduce_max_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
if (inner_work_amount > m_vector_size) {
reduce_max_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
reduce_max_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
}
const auto broadcast_horizon_max = push_node(std::make_shared<op::BroadcastMove>(horizon_max.second, broadcasted_dim));
const auto vector_buffer_sum = push_node(std::make_shared<op::VectorBuffer>());
@@ -104,12 +99,7 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
(*sum.first)->get_output_port(0)});
const auto& reduce_sum_loop_info = loop_manager->get_loop_info(reduce_sum_loop_id);
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(reduce_sum_loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
reduce_sum_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<SetFillOffset>(tail_size);
if (inner_work_amount > m_vector_size) {
reduce_sum_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
reduce_sum_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
}

// Divide is expensive operation, so we decompose it into 1 / x * y, where 1 / x is executed outside loop
@@ -125,18 +115,9 @@ bool SoftmaxDecomposition::run(LinearIR& linear_ir) {
linear_ir.replace_input(output_connector->get_consumers(), (*mul.first)->get_output_port_connector(0));

// Markup of Mul Loop
const auto mul_loop_id = loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0,
std::vector<ExpressionPort>{(*mul.first)->get_input_port(0),
(*mul.first)->get_input_port(1)},
std::vector<ExpressionPort>{(*mul.first)->get_output_port(0)});
const auto& mul_loop_info = loop_manager->get_loop_info(mul_loop_id);
if (tail_size != 0) {
lowered::pass::register_default_tail_handlers(mul_loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
if (inner_work_amount > m_vector_size) {
mul_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
mul_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
}
loop_manager->mark_loop(mul.first, expr_it, inner_work_amount, m_vector_size, 0,
std::vector<ExpressionPort>{(*mul.first)->get_input_port(0), (*mul.first)->get_input_port(1)},
std::vector<ExpressionPort>{(*mul.first)->get_output_port(0)});

// Update Loop info for outer loops
const auto entry_points = std::vector<ExpressionPort>{(*fill_max_tail.first)->get_input_port(0),
5 changes: 0 additions & 5 deletions src/common/snippets/src/lowered/pass/split_loops.cpp
@@ -92,12 +92,7 @@ bool SplitLoops::run(LinearIR& linear_ir) {
// Need to skip this transformation for such cases or improve the logic
if (tail_size != 0) {
// TODO: should we remove previous tail loop handler?
register_default_tail_handlers(new_loop_info->handlers[LoopInfo::LAST_ITER], tail_size);
new_loop_info->handlers[LoopInfo::LAST_ITER].register_pass<TransformInnerSplitLoop>(tail_size);
if (work_amount > increment) {
new_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ReduceWorkAmount>(tail_size);
new_loop_info->handlers[LoopInfo::MAIN_BODY].register_pass<ZeroFinalizationOffsets>();
}
}
break;
}