From 2699590c2bc4b0458a6165586af4c593c2cd735b Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 30 Nov 2023 15:42:51 +0100 Subject: [PATCH] Post rebase fixes --- .../src/lowered/pass/identify_buffers.cpp | 15 ++---- .../src/lowered/pass/insert_broadcastmove.cpp | 3 +- .../src/lowered/pass/insert_buffers.cpp | 54 ++++++++++++++----- .../src/lowered/pass/buffer_allocation.cpp | 8 +-- .../lowered/buffer_allocation.cpp | 14 +++-- 5 files changed, 58 insertions(+), 36 deletions(-) diff --git a/src/common/snippets/src/lowered/pass/identify_buffers.cpp b/src/common/snippets/src/lowered/pass/identify_buffers.cpp index 1e1310864fa717..7b19693c7c3b7c 100644 --- a/src/common/snippets/src/lowered/pass/identify_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/identify_buffers.cpp @@ -74,19 +74,14 @@ void IdentifyBuffers::update_adj_matrix(const std::pair IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool) { - // There are several sync points for adjacency check: - // 1. Loop because only in Loop we increment pointers. So if some Buffers in the one Loop have conflict - // (cannot be inplace: the different ptr increment and data sizes) they are called as adjacent - // 2. Brgemm because its blocking implementation requires Buffers with unique memory on all inputs and outputs + // The sync point to check for adjacency is Loop because only in Loop we increment pointers. 
+ // So if some Buffers in one Loop have a conflict (cannot be inplace: the different ptr increment and data sizes) + // they are called adjacent const auto size = pool.size(); std::vector adj(size * size, false); for (size_t i = 0; i < size; ++i) adj[index(size, i, i)] = true; - auto is_buffer = [](const ExpressionPort& port) { - return ov::is_type(port.get_expr()->get_node()); - }; - for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) { const auto &expr = *expr_it; const auto& loop_end = ov::as_type_ptr(expr->get_node()); @@ -207,9 +202,7 @@ auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector& adj) -> s bool IdentifyBuffers::run(LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::IdentifyBuffers") - // Unite Buffers using Graph coloring algorithm. - // Notes: We identify only Buffer with Intermediate memory because Buffers with new memory are used only in Brgemm case - // so these Buffers are always IntermediateBuffer nonadjacent + // Identify Buffers using Graph coloring algorithm. BufferPool buffer_pool; for (const auto& expr : linear_ir) { diff --git a/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp b/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp index a39536d990d41b..723b97b5a25788 100644 --- a/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp +++ b/src/common/snippets/src/lowered/pass/insert_broadcastmove.cpp @@ -56,8 +56,7 @@ bool InsertBroadcastMove::run(LinearIR& linear_ir) { OPENVINO_ASSERT(last_dims[i] == 1, "Attempt to broadcast non-1 dimension. 
Target dim: ", broadcasted_dim, " This dim: ", last_dims[i]); - const auto bcast_dim = ov::Dimension(last_dims[i]); - const auto broadcast = std::make_shared(node->get_input_source_output(i), bcast_dim); + const auto broadcast = std::make_shared(node->get_input_source_output(i), broadcasted_dim); PortDescriptorUtils::set_port_descriptor_ptr(broadcast->output(0), connectors[i]->get_source().get_descriptor_ptr()->clone()); const auto broadcast_expr = linear_ir.create_expression(broadcast, {connectors[i]}); diff --git a/src/common/snippets/src/lowered/pass/insert_buffers.cpp b/src/common/snippets/src/lowered/pass/insert_buffers.cpp index 9b8657217a6c41..81835a4ca390ae 100644 --- a/src/common/snippets/src/lowered/pass/insert_buffers.cpp +++ b/src/common/snippets/src/lowered/pass/insert_buffers.cpp @@ -49,26 +49,52 @@ ov::Shape compute_allocation_shape(const LinearIR::LoopManagerPtr& loop_manager, return allocation_shape; } - auto set_rest_dims_to_ones = [&](const int filled_dims_count) { - for (int i = 0; i < static_cast(allocation_shape.size()) - filled_dims_count; ++i) { - allocation_shape[i] = 1; + // If subtensor is set, its information is used for allocation shape computation. Two situations are possible: + // 1. Buffer is outside the parent loop: the corresponding subtensor value is ignored, parent loop work amount is set instead + // 2. Buffer is inside the parent loop: the corresponding subtensor value is used in allocation shape. + // Since we can definitely know which subtensor value corresponds to the loop only for 1st case + // (we can extract this info from loop exit port), we copy subtensor, and then replace subtensor values with parent loop work amount if needed. + // Example: + // Parent subtensor: [M_blk, N_blk] + // Buffer loop idces: [M_loop_idx], parent loop idces: [M_loop_idx, N_loop_idx] + // + // 1. Allocation shape is set to subtensor: [M_blk, N_blk] + // 2. Buffer is inside M_loop_idx loop => allocation shape is not changed + // 3. 
Buffer is outside N_loop_idx loop => the corresponding allocation shape value is replaced with N loop work amount + // So the result allocation shape is [M_blk, N_loop_work_amount] + const auto& subtensor = expr_port.get_descriptor_ptr()->get_subtensor(); + if (!subtensor.empty()) { + for (size_t i = 0; i < std::min(rank, subtensor.size()); ++i) { + auto& cur_val = *(allocation_shape.rbegin() + i); + const auto& subtensor_val = *(subtensor.rbegin() + i); + cur_val = std::min(cur_val, subtensor_val); + } + for (const auto& parent_loop : parent_loop_ids) { + if (std::find(buffer_loop_ids.begin(), buffer_loop_ids.end(), parent_loop) == buffer_loop_ids.end()) { + const auto loop_info = loop_manager->get_loop_info(parent_loop); + const auto& exit_points = loop_info->get_exit_points(); + auto it = std::find_if(exit_points.begin(), + exit_points.end(), + [&expr_port](const LinearIR::LoopManager::LoopPort& port) { + return *port.expr_port == expr_port; + }); + OPENVINO_ASSERT(it != exit_points.end(), "compute_allocation_shape: exit point of parent loop can not be found"); + const auto& loop_port = *it; + if (loop_port.is_incremented && loop_port.dim_idx < allocation_shape.size()) { + *(allocation_shape.rbegin() + loop_port.dim_idx) = loop_info->get_work_amount(); + } + } } - }; - - // In some cases it's possible to allocate less shape - // 1. Buffer and its parent are in the same loop: allocation size for the outer dimension can be extracted from loop increment - // 2. 
Buffer is outside the parent's loops: allocation size can be extracted from the corresponding loop work amount - // TODO: Use general logic with the help of memory counts for allocation shape computation - if (buffer_loop_ids.back() == parent_loop_ids.back()) { - const auto buffer_loop = loop_manager->get_loop_info(buffer_loop_ids.back()); - *(allocation_shape.rbegin() + 1) = buffer_loop->get_increment(); - set_rest_dims_to_ones(2); } else { + // WA: In case of empty subtensors, other information has to be used to update allocation shape. for (size_t i = 0; i < std::min(rank, parent_loop_ids.size()); ++i) { const auto loop = loop_manager->get_loop_info(*(parent_loop_ids.rbegin() + i)); + OPENVINO_ASSERT(loop->get_dim_idx() == i, "compute_allocation_shape: eltwise loop has unexpected dimension index"); *(allocation_shape.rbegin() + i) = loop->get_work_amount(); } - set_rest_dims_to_ones(static_cast(parent_loop_ids.size())); + for (int i = 0; i < allocation_rank - static_cast(parent_loop_ids.size()); ++i) { + allocation_shape[i] = 1; + } } return allocation_shape; } diff --git a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp index 4dc6ac8d365208..fe887cecd96f17 100644 --- a/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp +++ b/src/common/snippets/tests/src/lowered/pass/buffer_allocation.cpp @@ -92,9 +92,9 @@ std::shared_ptr EltwiseBufferAllocationTest::GetModel() const { const auto parameter0 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto parameter1 = std::make_shared(ov::element::f32, ov::PartialShape({1, 3, 100, 100})); const auto add = std::make_shared(parameter0, parameter1); - const auto buffer0 = std::make_shared(add, static_cast(subtensor_buffer.size())); + const auto buffer0 = std::make_shared(add, static_cast(subtensor_buffer.size())); const auto relu = std::make_shared(buffer0); - const auto buffer1 = 
std::make_shared(relu, static_cast(subtensor_buffer.size())); + const auto buffer1 = std::make_shared(relu, static_cast(subtensor_buffer.size())); const auto exp = std::make_shared(buffer1); const auto body = std::make_shared(std::make_shared(exp), ov::ParameterVector{parameter0, parameter1}); @@ -119,7 +119,7 @@ void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr MHABufferAllocationTest::GetModel() const { - const auto subtensor_scalar = std::vector{1, 1}; + const auto subtensor_scalar = std::vector{1}; const auto subtensor_eltwise = std::vector{1, m_vector_size}; const auto subtensor_brgemm = std::vector{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; const auto subtensor_softmax = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; @@ -187,7 +187,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHA ::testing::Values(true), ::testing::Values(true), ::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm) - ::testing::Values(3)), // (Buffer before brgemm) + (between brgemms) + (after brgemm) + ::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms) BufferAllocationTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest, diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp index 6202fdc77efd5f..de5b02c3c8349f 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp +++ b/src/plugins/intel_cpu/tests/unit/snippets_transformations/lowered/buffer_allocation.cpp @@ -70,8 +70,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam(); pipeline.register_pass(m_vector_size); + pipeline.register_pass(); pipeline.register_pass(m_vector_size); 
pipeline.register_pass(); if (with_split_loops) @@ -120,7 +120,7 @@ class BufferAllocationCPUTest : public testing::TestWithParam GetModel() const override { - const auto subtensor_scalar = std::vector{1, 1}; + const auto subtensor_scalar = std::vector{1}; const auto subtensor_softmax = std::vector{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM}; const auto subtensor_full = std::vector(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM); @@ -136,10 +136,12 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto brgemm_copyb0 = std::make_shared( convert1, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0); - const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch0 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); const auto brgemm_cpu0 = std::make_shared( parameter0, brgemm_copyb0->output(0), scratch0, ov::intel_cpu::BrgemmCPU::Type::AMX); brgemm_cpu0->set_m_block_size(32); + brgemm_cpu0->set_k_block_size(16); + brgemm_cpu0->set_n_block_size(64); const auto relu1 = std::make_shared(brgemm_cpu0); const auto softmax = std::make_shared(relu1, 3); @@ -147,10 +149,12 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest { const auto brgemm_copyb1 = std::make_shared( parameter2, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0); - const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); + const auto scratch1 = std::make_shared(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE}); const auto brgemm_cpu1 = std::make_shared( convert2, brgemm_copyb1->output(0), scratch1, ov::intel_cpu::BrgemmCPU::Type::AMX); brgemm_cpu1->set_m_block_size(32); + brgemm_cpu1->set_k_block_size(16); + brgemm_cpu1->set_n_block_size(64); const auto relu2 = std::make_shared(brgemm_cpu1); @@ -191,7 +195,7 @@ 
INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHA ::testing::Values(true), ::testing::Values(true), ::testing::Values(90112), - ::testing::Values(4)), + ::testing::Values(5)), BufferAllocationCPUTest::getTestCaseName); INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABF16AMXBufferAllocationTest,