From 2699590c2bc4b0458a6165586af4c593c2cd735b Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 30 Nov 2023 15:42:51 +0100 Subject: [PATCH] Post rebase fixes --- .../src/lowered/pass/identify_buffers.cpp | 15 ++---- .../src/lowered/pass/insert_broadcastmove.cpp | 3 +- .../src/lowered/pass/insert_buffers.cpp | 54 ++++++++++++++----- .../src/lowered/pass/buffer_allocation.cpp | 8 +-- .../lowered/buffer_allocation.cpp | 14 +++-- 5 files changed, 58 insertions(+), 36 deletions(-) diff --git a/src/common/snippets/src/lowered/pass/identify_buffers.cpp b/src/common/snippets/src/lowered/pass/identify_buffers.cpp index 1e1310864fa717..7b19693c7c3b7c 100644 --- a/src/common/snippets/src/lowered/pass/identify_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/identify_buffers.cpp @@ -74,19 +74,14 @@ void IdentifyBuffers::update_adj_matrix(const std::pair IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool) { - // There are several sync points for adjacency check: - // 1. Loop because only in Loop we increment pointers. So if some Buffers in the one Loop have conflict - // (cannot be inplace: the different ptr increment and data sizes) they are called as adjacent - // 2. Brgemm because its blocking implementation requires Buffers with unique memory on all inputs and outputs + // The sync point to check for adjacency is Loop because only in Loop we increment pointers. 
+ // So if some Buffers in one Loop have a conflict (cannot be inplace: the different ptr increment and data sizes) + // they are called adjacent const auto size = pool.size(); std::vector adj(size * size, false); for (size_t i = 0; i < size; ++i) adj[index(size, i, i)] = true; - auto is_buffer = [](const ExpressionPort& port) { - return ov::is_type(port.get_expr()->get_node()); - }; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { const auto &expr = *expr_it; const auto& loop_end = ov::as_type_ptr(expr->get_node()); @@ -207,9 +202,7 @@ auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector& adj) -> s bool IdentifyBuffers::run(LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::IdentifyBuffers") - // Unite Buffers using Graph coloring algorithm. - // Notes: We identify only Buffer with Intermediate memory because Buffers with new memory are used only in Brgemm case - // so these Buffers are always IntermediateBuffer nonadjacent + // Identify Buffers using Graph coloring algorithm. BufferPool buffer_pool; for (const auto& expr : linear_ir) { diff --git a/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp b/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp index a39536d990d41b..723b97b5a25788 100644 --- a/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp +++ b/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp @@ -56,8 +56,7 @@ bool InsertBroadcastMove::run(LinearIR& linear_ir) { OPENVINO_ASSERT(last_dims[i] == 1, "Attempt to broadcast non-1 dimension. 
Target dim: ", broadcasted_dim, " This dim: ", last_dims[i]); - const auto bcast_dim = ov::Dimension(last_dims[i]); - const auto broadcast = std::make_shared(node->get_input_source_output(i), bcast_dim); + const auto broadcast = std::make_shared(node->get_input_source_output(i), broadcasted_dim); PortDescriptorUtils::set_port_descriptor_ptr(broadcast->output(0), connectors[i]->get_source().get_descriptor_ptr()->clone()); const auto broadcast_expr = linear_ir.create_expression(broadcast, {connectors[i]}); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index 9b8657217a6c41..81835a4ca390ae 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -49,26 +49,52 @@ ov::Shape compute_allocation_shape(const LinearIR::LoopManagerPtr& loop_manager, return allocation_shape; } - auto set_rest_dims_to_ones = [&](const int filled_dims_count) { - for (int i = 0; i < static_cast(allocation_shape.size()) - filled_dims_count; ++i) { - allocation_shape[i] = 1; + // If subtensor is set, its information is used for allocation shape computation. Two situations are possible: + // 1. Buffer is outside the parent loop: the corresponding subtensor value is ignored, parent loop work amount is set instead + // 2. Buffer is inside the parent loop: the corresponding subtensor value is used in allocation shape. + // Since we can definitely know which subtensor value corresponds to the loop only for 1st case + // (we can extract this info from loop exit port), we copy subtensor, and then replace subtensor values with parent loop work amount if needed. + // Example: + // Parent subtensor: [M_blk, N_blk] + // Buffer loop idces: [M_loop_idx], parent loop idces: [M_loop_idx, N_loop_idx] + // + // 1. Allocation shape is set to subtensor: [M_blk, N_blk] + // 2. Buffer is inside M_loop_idx loop => allocation shape is not changed + // 3. 
Buffer is outside N_loop_idx loop => the corresponding allocation shape value is replaced with N loop work amount + // So the result allocation shape is [M_blk, N_loop_work_amount] + const auto& subtensor = expr_port.get_descriptor_ptr()->get_subtensor(); + if (!subtensor.empty()) { + for (size_t i = 0; i < std::min(rank, subtensor.size()); ++i) { + auto& cur_val = *(allocation_shape.rbegin() + i); + const auto& subtensor_val = *(subtensor.rbegin() + i); + cur_val = std::min(cur_val, subtensor_val); + } + for (const auto& parent_loop : parent_loop_ids) { + if (std::find(buffer_loop_ids.begin(), buffer_loop_ids.end(), parent_loop) == buffer_loop_ids.end()) { + const auto loop_info = loop_manager->get_loop_info(parent_loop); + const auto& exit_points = loop_info->get_exit_points(); + auto it = std::find_if(exit_points.begin(), + exit_points.end(), + [&expr_port](const LinearIR::LoopManager::LoopPort& port) { + return *port.expr_port == expr_port; + }); + OPENVINO_ASSERT(it != exit_points.end(), "compute_allocation_shape: exit point of parent loop can not be found"); + const auto& loop_port = *it; + if (loop_port.is_incremented && loop_port.dim_idx < allocation_shape.size()) { + *(allocation_shape.rbegin() + loop_port.dim_idx) = loop_info->get_work_amount(); + } + } } - }; - - // In some cases it's possible to allocate less shape - // 1. Buffer and its parent are in the same loop: allocation size for the outer dimension can be extracted from loop increment - // 2. 
Buffer is outside the parent's loops: allocation size can be extracted from the corresponding loop work amount - // TODO: Use general logic with the help of memory counts for allocation shape computation - if (buffer_loop_ids.back() == parent_loop_ids.back()) { - const auto buffer_loop = loop_manager->get_loop_info(buffer_loop_ids.back()); - *(allocation_shape.rbegin() + 1) = buffer_loop->get_increment(); - set_rest_dims_to_ones(2); } else { + // WA: In case of empty subtensors, other information has to be used to update allocation shape. for (size_t i = 0; i < std::min(rank, parent_loop_ids.size()); ++i) { const auto loop = loop_manager->get_loop_info(*(parent_loop_ids.rbegin() + i)); + OPENVINO_ASSERT(loop->get_dim_idx() == i, "compute_allocation_shape: eltwise loop has unexpected dimension index"); *(allocation_shape.rbegin() + i) = loop->get_work_amount(); } - set_rest_dims_to_ones(static_cast(parent_loop_ids.size())); + for (int i = 0; i < allocation_rank - static_cast(parent_loop_ids.size()); ++i) { + allocation_shape[i] = 1; + } } return allocation_shape; } diff --git a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp index 4dc6ac8d365208..fe887cecd96f17 100644 --- a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp +++ b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp @@ -92,9 +92,9 @@ std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto add = std::make_shared(parameter0, parameter1); - const auto buffer0 = std::make_shared(add, static_cast(subtensor_buffer.size())); + const auto buffer0 = std::make_shared(add, static_cast(subtensor_buffer.size())); const auto relu = std::make_shared(buffer0); - const auto buffer1 = 
std::make_shared(relu, static_cast(subtensor_buffer.size())); + const auto buffer1 = std::make_shared(relu, static_cast(subtensor_buffer.size())); const auto exp = std::make_shared(buffer1); const auto body = std::make_shared(std::make_shared(exp), ov::ParameterVector{parameter0, parameter1}); @@ -119,7 +119,7 @@ void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr MHABufferAllocationTest::GetModel() const { - const auto subtensor_scalar = std::vector{1, 1}; + const auto subtensor_scalar = std::vector{1}; const auto subtensor_eltwise = std::vector{1, m_vector_size}; const auto subtensor_brgemm = std::vector{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; const auto subtensor_softmax = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; @@ -187,7 +187,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHA ::testing::Values(true), ::testing::Values(true), ::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm) - ::testing::Values(3)), // (Buffer before brgemm) + (between brgemms) + (after brgemm) + ::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) BufferAllocationTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest, diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp index 6202fdc77efd5f..de5b02c3c8349f 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp @@ -70,8 +70,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam(); pipeline.register_pass(m_vector_size); + pipeline.register_pass(); pipeline.register_pass(m_vector_size); 
pipeline.register_pass(); if (with_split_loops) @@ -120,7 +120,7 @@ class BufferAllocationCPUTest : public testing::TestWithParam GetModel() const override { - const auto subtensor_scalar = std::vector{1, 1}; + const auto subtensor_scalar = std::vector{1}; const auto subtensor_softmax = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; const auto subtensor_full = std::vector(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM); @@ -136,10 +136,12 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto brgemm_copyb0 = std::make_shared( convert1, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0); - const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); const auto brgemm_cpu0 = std::make_shared( parameter0, brgemm_copyb0->output(0), scratch0, ov::intel_cpu::BrgemmCPU::Type::AMX); brgemm_cpu0->set_m_block_size(32); + brgemm_cpu0->set_k_block_size(16); + brgemm_cpu0->set_n_block_size(64); const auto relu1 = std::make_shared(brgemm_cpu0); const auto softmax = std::make_shared(relu1, 3); @@ -147,10 +149,12 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto brgemm_copyb1 = std::make_shared( parameter2, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0); - const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); const auto brgemm_cpu1 = std::make_shared( convert2, brgemm_copyb1->output(0), scratch1, ov::intel_cpu::BrgemmCPU::Type::AMX); brgemm_cpu1->set_m_block_size(32); + brgemm_cpu1->set_k_block_size(16); + brgemm_cpu1->set_n_block_size(64); const auto relu2 = std::make_shared(brgemm_cpu1); @@ -191,7 +195,7 @@ 
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHA ::testing::Values(true), ::testing::Values(true), ::testing::Values(90112), - ::testing::Values(4)), + ::testing::Values(5)), BufferAllocationCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABF16AMXBufferAllocationTest,