From 63ba621f5af7aace5f0295188a317aa8a813a83c Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 16 Oct 2024 15:45:35 +0200 Subject: [PATCH 01/42] [Snippets][WIP] Move CopyB repacking out from Subgraph --- .../include/snippets/runtime_configurator.hpp | 6 ++ .../snippets/include/snippets/utils/utils.hpp | 10 +++ .../snippets/src/runtime_configurator.cpp | 35 +++++---- .../snippets/cpu_runtime_configurator.cpp | 66 +++++++++++++++++ .../snippets/cpu_runtime_configurator.hpp | 6 ++ src/plugins/intel_cpu/src/nodes/reorder.cpp | 6 -- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 72 ++++++++++++++++--- src/plugins/intel_cpu/src/nodes/subgraph.h | 17 ++++- .../snippets/x64/op/brgemm_cpu.cpp | 21 +----- .../snippets/x64/op/brgemm_cpu.hpp | 1 - .../snippets/x64/op/brgemm_utils.hpp | 2 +- .../x64/pass/lowered/brgemm_cpu_blocking.cpp | 27 ++++--- .../x64/pass/move_brgemm_repacking_out.cpp | 61 ++++++++++++++++ .../x64/pass/move_brgemm_repacking_out.hpp | 22 ++++++ .../common_test_utils/src/ov_tensor_utils.cpp | 4 +- 15 files changed, 291 insertions(+), 65 deletions(-) create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 10c15a4621a72a..7c8d03ea0c20d7 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -157,6 +157,12 @@ class RuntimeConfigurator { */ std::vector> extract_layouts() const; + static void compute_offsets(const ov::snippets::VectorDims& shape, + ov::snippets::VectorDims& offsets, + size_t offsets_size, + size_t dim_step, + size_t idx_stride); + class MHAParallelWAOptimizer { public: MHAParallelWAOptimizer() = default; diff --git a/src/common/snippets/include/snippets/utils/utils.hpp b/src/common/snippets/include/snippets/utils/utils.hpp index ff4646f24d03b7..d885f057f8687f 100644 --- a/src/common/snippets/include/snippets/utils/utils.hpp +++ b/src/common/snippets/include/snippets/utils/utils.hpp @@ -311,6 +311,16 @@ void visit_path(const lowered::ExpressionPtr& expr, std::function func, bool visit_parent_path); +/** + * @brief Checks if layout is planar + */ +inline bool is_planar_layout(const std::vector& layout) { + for (size_t i = 0; i < layout.size(); ++i) + if (layout[i] != i) + return false; + return true; +} + } // namespace utils } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 9174d93eea3f98..b0e10c567a0fc6 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -261,7 +261,7 @@ void RuntimeConfigurator::update_data_offsets(const std::vector& sha const std::vector>& layouts) const { OPENVINO_ASSERT(shapes.size() == m_io_num, "Number of custom shapes must be 0 or be equal to m_io_num"); OPENVINO_ASSERT(layouts.size() == m_io_num, "Number of custom layouts must be 0 or be equal to m_io_num"); - for (size_t i = 0; i < m_io_num; ++i) { + for (size_t i = 0; i < m_io_num; ++i) { // offsets represent distance between consecutive elements of corresponding dimension. 
// If a dim size == 1, then the next dim starts immediately and the stride is 0 // case 1: @@ -271,26 +271,17 @@ void RuntimeConfigurator::update_data_offsets(const std::vector& sha // shape: s0, s1, s2 == 1, s3 // offsets: s1*s3, s3, 0, 1 const auto& shape = shapes[i]; + OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!"); if (shape == m_latest_shapes[i]) continue; - - const auto& layout = layouts[i]; - auto& offsets = m_config->io_data_offsets[i]; - - offsets.resize(m_config->tensor_rank); - std::fill(offsets.begin(), offsets.end(), 0); if (utils::is_dynamic_vdims(shape)) return; - size_t dim_step = m_io_data_sizes[i]; - offsets[offsets.size() - 1] = dim_step; - - OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!"); + auto& offsets = m_config->io_data_offsets[i]; const auto idx_stride = m_config->tensor_rank - shape.size(); - for (int i = static_cast(shape.size()) - 2; i >= 0; i--) { - dim_step *= shape[i + 1]; - offsets[i + idx_stride] = shape[i] != 1 ? dim_step : 0; - } + compute_offsets(shape, offsets, m_config->tensor_rank, m_io_data_sizes[i], idx_stride); + + const auto& layout = layouts[i]; if (!layout.empty()) { std::vector reordered_offsets(offsets.size()); const auto is_input = i < m_in_num; @@ -318,6 +309,20 @@ std::vector> RuntimeConfigurator::extract_layouts() const { return layouts; } +void RuntimeConfigurator::compute_offsets(const ov::snippets::VectorDims& shape, + ov::snippets::VectorDims& offsets, + size_t offsets_size, + size_t dim_step, + size_t idx_stride) { + offsets.resize(offsets_size); + std::fill(offsets.begin(), offsets.end(), 0); + offsets[offsets.size() - 1] = dim_step; + for (int i = static_cast(shape.size()) - 2; i >= 0; i--) { + dim_step *= shape[i + 1]; + offsets[i + idx_stride] = shape[i] != 1 ? 
dim_step : 0; + } +} + void RuntimeConfigurator::set_kernel_executor_table(std::shared_ptr table) const { OPENVINO_ASSERT(table, "Failed to update Kernel Executo Table: passed table is missed"); m_config->kernel_executor_table = std::move(table); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 1c3d283ab673b1..f38e68e3a3746b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -4,10 +4,15 @@ #include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "memory_desc/cpu_blocked_memory_desc.h" +#include "memory_desc/cpu_memory_desc_utils.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" #ifndef OPENVINO_ARCH_ARM64 +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" #include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" #endif namespace ov { @@ -47,6 +52,7 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI } void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { + update_requested_descs(linear_ir); m_config->master_shape = linear_ir->get_master_shape(); if (linear_ir->is_dynamic()) { update_loop_info(linear_ir); @@ -69,6 +75,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l if (linear_ir->is_dynamic()) { update_loop_args(linear_ir); } + adjust_offsets_from_descs(linear_ir); } void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { @@ -136,5 +143,64 @@ void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { } #endif +void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const { + const auto& cpu_config = ov::as_type_ptr(m_config); + auto& optimal_descs = cpu_config->m_in_requested_descs; + optimal_descs.resize(m_in_num); + const auto& params = linear_ir->get_parameters(); + OPENVINO_ASSERT(params.size() == m_in_num); + for (size_t i = 0; i < m_in_num; ++i) { + const auto& param = params[i]; + auto consumers = param->get_output_port_connector(0)->get_consumers(); + const bool brgemm_with_extracted_repacking = + std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { + auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); + return port.get_index() == 1 && brgemm && brgemm_utils::with_repacking(brgemm->get_type()); + }); + if (brgemm_with_extracted_repacking) { + const auto& desc = param->get_output_port_descriptor(0); + const auto& shape = desc->get_shape(); + const auto& K = *++shape.rbegin(); + const auto& N = *shape.rbegin(); + + const auto& precision = param->get_node()->get_output_element_type(0); + const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); + // Firstly, batch dims are set + VectorDims requested_blocked_shape(shape.begin(), shape.end() - m_config->tile_rank); + // Then, the blocked dims are formed + requested_blocked_shape.insert( + requested_blocked_shape.end(), + {snippets::utils::div_up(K, vnni_factor), std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), vnni_factor}); + + VectorDims requested_order(shape.size() - m_config->tile_rank); + 
std::iota(requested_order.begin(), requested_order.end(), 0); + const auto last_idx = shape.size() - 1; + requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); + + auto cpu_desc = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); + optimal_descs[i] = MemoryDescUtils::convertToDnnlMemoryDesc(cpu_desc); + } + } +} +void CPURuntimeConfigurator::adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const { + const auto& cpu_config = ov::as_type_ptr(m_config); + auto& optimal_descs = cpu_config->m_in_requested_descs; + for (size_t i = 0; i < m_in_num; ++i) { + const auto& optimal_desc = optimal_descs[i]; + if (optimal_desc) { + // It is assumed that shape is planar + const auto& parameter = linear_ir->get_parameters()[i]; + const auto& original_shape = parameter->get_output_port_descriptor(0)->get_shape(); + const auto& blocked_shape = optimal_desc->as()->getBlockDims(); + + ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1); + shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin(), blocked_shape.end()); + auto& offsets = m_config->io_data_offsets[i]; + compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_io_data_sizes[i], 0); + OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(parameter->get_output_port_descriptor(0)->get_layout())); + } + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index d8ef9772e813ff..80d575c9f09f3a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -9,6 +9,8 @@ #include "snippets/lowered/port_descriptor.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" +#include "memory_desc/cpu_memory_desc.h" + namespace ov { namespace intel_cpu { @@ -22,6 +24,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { #endif std::vector loop_args = {}; + std::vector m_in_requested_descs = {}; }; class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { @@ -51,6 +54,9 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + void update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + void adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + static const size_t rank6D; class BrgemmCopyBLoopPortsAdjuster { diff --git a/src/plugins/intel_cpu/src/nodes/reorder.cpp b/src/plugins/intel_cpu/src/nodes/reorder.cpp index 9b521cdb3b57c7..7257e31369bd66 100644 --- a/src/plugins/intel_cpu/src/nodes/reorder.cpp +++ b/src/plugins/intel_cpu/src/nodes/reorder.cpp @@ -17,13 +17,7 @@ #include #include -#include "convert.h" #include "cpu/x64/cpu_isa_traits.hpp" -#include "nodes/common/cpu_convert.h" -#include "nodes/common/cpu_memcpy.h" -#include "nodes/common/reorder_prim.h" -#include "openvino/core/parallel.hpp" -#include "shape_inference/shape_inference_pass_through.hpp" #include "utils/precision_support.h" #include "nodes/executors/executor.hpp" #include "nodes/executors/transpose_list.hpp" diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index ee24dd66493204..474c29556fe6a5 100644 --- 
a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -3,6 +3,10 @@ // #include "subgraph.h" +#include "nodes/reorder.h" +#include "nodes/common/reorder_prim.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "memory_desc/cpu_memory_desc_utils.h" #include "common/primitive_hashing_utils.hpp" #include "dnnl_extension_utils.h" #include "onednn/dnnl.h" @@ -35,6 +39,7 @@ #include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" #include "transformations/snippets/x64/pass/remove_converts.hpp" #include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp" +#include "transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp" #include "transformations/snippets/x64/pass/enforce_precision.hpp" #include "transformations/snippets/x64/shape_inference.hpp" #include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" @@ -76,10 +81,11 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {} + const BufferScratchpadAllocator& allocator, + const DnnlScratchPadPtr& scratchpad) + : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator, scratchpad) {} - void exec(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { const auto& callable = m_schedule->get_callable(); auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { @@ -119,15 +125,16 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) { + const BufferScratchpadAllocator& allocator, + const DnnlScratchPadPtr& scratchpad) + : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator, scratchpad) { buffer_offsets = snippet_config->buffer_cluster_offsets; data_offsets = snippet_config->io_data_offsets; loop_args = snippet_config->loop_args; reset_exec_table_state = snippet_config->kernel_executor_table->get_state_reset(); } - void exec(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { + void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { const auto& callable = m_schedule->get_callable(); OPENVINO_ASSERT(data_offsets.size() == inMemPtrs.size() + outMemPtrs.size(), "Incorrect data offset count!"); @@ -648,6 +655,9 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() { } SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::Before, ov::snippets::pass::PropagatePrecision, ov::intel_cpu::pass::BrgemmToBrgemmCPU); + if (!std::getenv("REFERENCE")) + SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::After, ov::intel_cpu::pass::BrgemmToBrgemmCPU, + ov::intel_cpu::pass::MoveBrgemmRepackingOut); SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64(Place::PipelineEnd, ov::intel_cpu::pass::RemoveConverts); SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(Place::PipelineEnd, ov::intel_cpu::pass::MulAddToFMA); @@ -782,7 +792,13 @@ void 
Subgraph::prepareParams() { snippet->get_runtime_configurator()->set_kernel_executor_table(code_gen->get()->lowering_result.kernel_executor_table); } const auto& snippet_config = ov::as_type_ptr(snippet->update_runtime_config()); - return std::make_shared(key.attrs, code_gen, start_offset_in, start_offset_out, snippet_config, allocator); + return std::make_shared(key.attrs, + code_gen, + start_offset_in, + start_offset_out, + snippet_config, + allocator, + context->getScratchPad()); } else { // Static case: // 1. Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be compiled in JIT code @@ -793,7 +809,13 @@ void Subgraph::prepareParams() { [&snippet_config](const SubgraphCodeGeneratorKey& key) -> std::shared_ptr { return std::make_shared(key.attrs, snippet_config); }); - return std::make_shared(key.attrs, code_gen_result.first, start_offset_in, start_offset_out, snippet_config, allocator); + return std::make_shared(key.attrs, + code_gen_result.first, + start_offset_in, + start_offset_out, + snippet_config, + allocator, + context->getScratchPad()); } }; @@ -846,7 +868,7 @@ bool Subgraph::created() const { void Subgraph::execute(dnnl::stream strm) { OPENVINO_ASSERT(execPtr, "Can't execute Subgraph node. Primitive didn't created"); - execPtr->exec(srcMemPtrs, dstMemPtrs); + execPtr->execute(strm, srcMemPtrs, dstMemPtrs); } void Subgraph::executeDynamicImpl(dnnl::stream strm) { @@ -882,8 +904,9 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : m_schedule(snippet->get()), m_start_offset_in(start_offset_in), m_start_offset_out(start_offset_out) { + const BufferScratchpadAllocator& allocator, + const DnnlScratchPadPtr& scratchpad) + : m_schedule(snippet->get()), m_start_offset_in(start_offset_in), m_start_offset_out(start_offset_out), m_scratchpad(scratchpad) { OPENVINO_ASSERT(m_schedule, "Schedule is empty!"); OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!"); init_parallel_domain(snippet_config, m_parallel_exec_domain); @@ -895,6 +918,12 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr(m_nthreads) * m_buffer_scratchpad_size); + const auto& requested_descs = snippet_config->m_in_requested_descs; + m_requested_repackings.resize(requested_descs.size()); + for (size_t i = 0; i < requested_descs.size(); ++i) { + m_requested_repackings[i].requested_desc = requested_descs[i]; + } + #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) const auto target = std::dynamic_pointer_cast(snippet_attrs->snippet->get_generator()->get_target_machine()); enabled_segfault_detector = target && target->debug_config.enable_segfault_detector; @@ -970,6 +999,27 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function& inMemPtrs, std::vector& outMemPtrs) { + repack_inputs(strm, inMemPtrs); + exec_impl(inMemPtrs, outMemPtrs); +} + +void Subgraph::SubgraphExecutor::repack_inputs(dnnl::stream strm, std::vector& inMemPtrs) { + OPENVINO_ASSERT(inMemPtrs.size() == m_requested_repackings.size()); + for (size_t i = 0; i < m_requested_repackings.size(); ++i) { + const auto& requested_desc = m_requested_repackings[i].requested_desc; + auto& scratch_mem = m_requested_repackings[i].scratch_mem; + if (requested_desc) { + if (!scratch_mem || !scratch_mem->getDesc().isCompatible(*requested_desc)) { + scratch_mem = 
m_scratchpad->createScratchPadMem(requested_desc); + std::cout << "scratch_mem is created for requested desc " << i << std::endl; + } + scratch_mem->load(*inMemPtrs[i]); + inMemPtrs[i] = scratch_mem; + } + } +} + } // namespace node } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index ffd7944c59d48a..606c75a360f16f 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -126,12 +126,15 @@ class Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator); + const BufferScratchpadAllocator& allocator, + const DnnlScratchPadPtr& scratchpad); virtual ~SubgraphExecutor() = default; - virtual void exec(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; + void execute(dnnl::stream strm, std::vector& inMemPtrs, std::vector& outMemPtrs); protected: + virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; + void parallel_for6d(const std::function& initializer, const std::function& caller); void parallel_forNd(const std::function& initializer, @@ -164,6 +167,16 @@ class Subgraph::SubgraphExecutor { bool enabled_segfault_detector = false; inline void segfault_detector(); #endif + +private: + void repack_inputs(dnnl::stream strm, std::vector& inMemPtrs); + + struct RequestedRepacking { + MemoryDescPtr requested_desc = {}; + MemoryPtr scratch_mem = {}; + }; + std::vector m_requested_repackings = {}; + DnnlScratchPadPtr m_scratchpad = {}; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index b40bd88f31726b..0dac2c5dc4d809 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -68,13 +68,10 @@ void BrgemmCPU::custom_constructor_validate_and_infer_types(std::vector INTERNAL_OP_SCOPE(BrgemmCPU_constructor_validate_and_infer_types); validate_inputs(); - // During ctor call, BrgemmCPU doesn't know his port descriptors. - // So we use port descs from source inputs - const auto brgemm_copy = with_repacking(m_type) ? get_brgemm_copy() : nullptr; + // This shape inference can use get_input_partial_shape(1) in all cases const auto planar_input_shapes = std::vector{ snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout_a), - brgemm_copy ? 
snippets::utils::get_planar_pshape(brgemm_copy->input(0)) - : snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b) }; + snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b) }; auto output_shape = infer_output_partial_shape(planar_input_shapes); set_output_type(0, get_output_type(), snippets::utils::get_planar_pshape(output_shape, layout_c)); @@ -130,20 +127,6 @@ std::shared_ptr BrgemmCPU::clone_with_new_inputs(const OutputVector& new_a } } -std::shared_ptr BrgemmCPU::get_brgemm_copy() const { - OPENVINO_ASSERT(one_of(m_type, BRGEMM_TYPE::REPACKING_ONLY, BRGEMM_TYPE::WITH_COMPENSATIONS, BRGEMM_TYPE::WITH_AMX), "Brgemm doesn't need BrgemmCopyB"); - auto b_input_node = get_input_node_shared_ptr(1); - if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node)) { - return brgemm_copy_b; - } - if (ov::is_type(b_input_node)) { - if (const auto brgemm_copy_b = ov::as_type_ptr(b_input_node->get_input_node_shared_ptr(0))) { - return brgemm_copy_b; - } - } - OPENVINO_THROW("BrgemmCopyB hasn't been found!"); -} - size_t BrgemmCPU::get_offset_scratch() const { OPENVINO_ASSERT(with_scratchpad(m_type) && get_input_size() == 3, "Offset of scratchpad must be only in Brgemm with scratchpad on 3rd input"); return get_input_offset(2); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp index a646ffc792fd6d..a781bc7ddd4e15 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.hpp @@ -44,7 +44,6 @@ class BrgemmCPU : public snippets::op::Brgemm { BRGEMM_TYPE get_type() const { return m_type; } size_t get_offset_scratch() const; - std::shared_ptr get_brgemm_copy() const; bool visit_attributes(AttributeVisitor& visitor) override; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index aeb5b22cd56129..eccb8cfdb7c479 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -18,7 +18,7 @@ enum class BRGEMM_TYPE { STAND_ALONE, // No extra requirements, used for f32|f32 WITH_AMX, // i8|i8 or bf16|bf16 on AMX system - needs BrgemmCopyB and scratchpad WITH_COMPENSATIONS, // i8|i8 (non-AMX system) - needs BrgemmCopyB for data repacking and compensations - REPACKING_ONLY // u8|i8 or bf16|bf16 (non-AMX system) - needs BrgemmCopyB on second input for data repacking + REPACKING_ONLY, // low precision or some specific f32 cases - needs BrgemmCopyB on second input for data repacking }; dnnl::impl::cpu::x64::cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp index 9b3009284e09e8..f9bab5ca5b96d7 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp @@ -83,11 +83,25 @@ bool BrgemmCPUBlocking::mark_blocking_loops(LinearIR& linear_ir, if (stand_alone(type)) return res; - const auto copy_b_expr = linear_ir.get_expr_by_node(brgemm->get_brgemm_copy()); - const ov::snippets::VectorDims 
full_subtensor(2, get_full_dim_value()); - copy_b_expr->get_input_port_descriptor(0)->set_subtensor(full_subtensor); - copy_b_expr->get_output_port_descriptor(0)->set_subtensor(full_subtensor); - + ExpressionPtr copy_b_expr; + const auto b_input_expr = brgemm_expr->get_input_port_connector(1)->get_source().get_expr(); + if (ov::is_type(b_input_expr->get_node())) { + copy_b_expr = b_input_expr; + } else if (ov::is_type(b_input_expr->get_node())) { + const auto input_buffer_expr = b_input_expr->get_input_port_connector(0)->get_source().get_expr(); + if (ov::is_type(b_input_expr->get_node())) + copy_b_expr = input_buffer_expr; + } + if (copy_b_expr) { + copy_b_expr->get_input_port_descriptor(0)->set_subtensor({get_full_dim_value(), get_full_dim_value()}); + copy_b_expr->get_output_port_descriptor(0)->set_subtensor({get_full_dim_value(), get_full_dim_value()}); + if (with_compensations(type)) { + const ov::snippets::VectorDims compensations_subtensor{1, get_full_dim_value()}; + OPENVINO_ASSERT(brgemm_expr->get_input_count() == 3, "Brgemm must have 3 inputs in case of compensations."); + brgemm_expr->get_input_port_descriptor(2)->set_subtensor(compensations_subtensor); + copy_b_expr->get_output_port_descriptor(1)->set_subtensor(compensations_subtensor); + } + } if (with_amx(type)) { move_new_memory_buffer(linear_ir, brgemm_it); auto buffer_it = std::prev(brgemm_it); @@ -96,11 +110,8 @@ bool BrgemmCPUBlocking::mark_blocking_loops(LinearIR& linear_ir, const auto& loop_manager = linear_ir.get_loop_manager(); if (with_compensations(type)) { - const ov::snippets::VectorDims compensations_subtensor{1, get_full_dim_value()}; OPENVINO_ASSERT(brgemm_expr->get_input_count() == 3, "Brgemm must have 3 inputs in case of compensations."); const auto& compens_port = brgemm_expr->get_input_port(2); - compens_port.get_descriptor_ptr()->set_subtensor(compensations_subtensor); - copy_b_expr->get_output_port_descriptor(1)->set_subtensor(compensations_subtensor); const auto& loop_ids = brgemm_expr->get_loop_ids(); size_t i = 0; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp new file mode 100644 index 00000000000000..0dd8e8a1fb5dc2 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp @@ -0,0 +1,61 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/itt.hpp" + +#include "move_brgemm_repacking_out.hpp" + +#include "snippets/utils/utils.hpp" +#include "snippets/op/brgemm.hpp" +#include "snippets/op/buffer.hpp" +#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/tpp/x64/op/modifiers.hpp" + +#include "openvino/core/rt_info.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/matcher.hpp" + +#include "cpu/x64/cpu_isa_traits.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + +#include "cpu_shape.h" +#include "utils/general_utils.h" + + +namespace ov { +namespace intel_cpu { + +using namespace snippets::lowered; + + +pass::MoveBrgemmRepackingOut::MoveBrgemmRepackingOut() { + MATCHER_SCOPE(MoveBrgemmRepackingOut); + auto m_param = ov::pass::pattern::wrap_type(); + auto m_copy_b = ov::pass::pattern::wrap_type({m_param}); + + auto callback = [=](ov::pass::pattern::Matcher& m) { + 
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::MoveBrgemmRepackingOut") + const auto& pattern_map = m.get_pattern_value_map(); + const auto& copy_b_in = pattern_map.at(m_param); + const auto& copy_b_out = pattern_map.at(m_copy_b); + const auto copy_b_node = copy_b_out.get_node_shared_ptr(); + + const auto& in_desc = PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); + const auto& layout = in_desc->get_layout(); + // TODO: + // 1. handle copyB with compensations + // 2. handle non-planar layout + if (!ov::snippets::utils::is_planar_layout(layout) || copy_b_node->get_output_size() != 1 || + transformation_callback(copy_b_node)) + return false; + std::cout << "[ INFO ] MoveBrgemmRepackingOut is finished\n"; + return ov::replace_output_update_name(copy_b_out, copy_b_in); + }; + + auto m = std::make_shared(m_copy_b, matcher_name); + register_matcher(m, callback); +} +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp new file mode 100644 index 00000000000000..c82193c93f1d4b --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_cpu { +namespace pass { + +class MoveBrgemmRepackingOut: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("MoveBrgemmRepackingOut", "0"); + MoveBrgemmRepackingOut(); +}; + + +} // namespace pass +} // namespace intel_cpu +} // namespace ov diff --git a/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp b/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp index 133959fd9fdc6b..7cbf02eb4bf143 100644 --- a/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp +++ b/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp @@ -444,9 +444,9 @@ class Error { << " Diff: " << std::fabs(val.expected_value - val.actual_value) << " calculated_abs_threshold: " << val.threshold << " abs_threshold: " << abs_threshold << " rel_threshold: " << rel_threshold << "\n"; -#ifdef NDEBUG +// #ifdef NDEBUG break; -#endif +// #endif } throw std::runtime_error(msg); } else if (!less_or_equal(mvn_results, mvn_threshold)) { From 74c4557457ed4422a995acb7c3bd4bbf2e12a419 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 7 Nov 2024 08:26:22 +0100 Subject: [PATCH 02/42] Disable CopyB moving out for i8i8 case --- .../snippets/x64/pass/move_brgemm_repacking_out.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp index 0dd8e8a1fb5dc2..eb68356c7c6094 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp @@ -40,14 +40,15 @@ pass::MoveBrgemmRepackingOut::MoveBrgemmRepackingOut() { const auto& pattern_map = m.get_pattern_value_map(); const auto& copy_b_in = pattern_map.at(m_param); const auto& copy_b_out = pattern_map.at(m_copy_b); - const auto copy_b_node = copy_b_out.get_node_shared_ptr(); + const auto copy_b_node = 
ov::as_type_ptr(copy_b_out.get_node_shared_ptr()); + OPENVINO_ASSERT(copy_b_node, "BrgemmCopyB node is null in MoveBrgemmRepackingOut transformation"); const auto& in_desc = PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); const auto& layout = in_desc->get_layout(); // TODO: // 1. handle copyB with compensations // 2. handle non-planar layout - if (!ov::snippets::utils::is_planar_layout(layout) || copy_b_node->get_output_size() != 1 || + if (!ov::snippets::utils::is_planar_layout(layout) || copy_b_node->get_src_element_type() == ov::element::i8 || transformation_callback(copy_b_node)) return false; std::cout << "[ INFO ] MoveBrgemmRepackingOut is finished\n"; From 8e66e61338ed5a2080718ed8776d882bbb9d0caf Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 7 Nov 2024 08:30:26 +0100 Subject: [PATCH 03/42] [TMP] Avoid createScratchPadMem usage --- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 474c29556fe6a5..14ac98821264b0 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -1011,7 +1011,8 @@ void Subgraph::SubgraphExecutor::repack_inputs(dnnl::stream strm, std::vectorgetDesc().isCompatible(*requested_desc)) { - scratch_mem = m_scratchpad->createScratchPadMem(requested_desc); + // scratch_mem = m_scratchpad->createScratchPadMem(requested_desc); + scratch_mem = std::make_shared(strm.get_engine(), requested_desc); std::cout << "scratch_mem is created for requested desc " << i << std::endl; } scratch_mem->load(*inMemPtrs[i]); From 07539112286c819567daba70d753bfce85425ec4 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 7 Nov 2024 17:17:41 +0100 Subject: [PATCH 04/42] update_ptrs fix for dynamic --- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 20 ++++++++++---------- src/plugins/intel_cpu/src/nodes/subgraph.h | 4 ++-- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 14ac98821264b0..02e126a27a9a60 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -91,8 +91,8 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { init_call_args(call_args, inMemPtrs, outMemPtrs, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const size_t* indexes) { - callable(&call_args, indexes); + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { + callable(&call_args, indexes.data()); }; if (m_parallel_exec_domain.size() == rank6D) { @@ -151,7 +151,7 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { init_call_args(call_args, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const size_t* indexes) { + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); callable(&call_args); }; @@ -188,17 +188,17 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { } inline void update_ptrs(jit_snippets_call_args& call_args, const std::vector& src_ptrs, - const std::vector& dst_ptrs, const size_t* indexes) const { + const std::vector& dst_ptrs, const 
std::vector& indexes) const { for (size_t i = 0; i < src_ptrs.size(); i++) { auto i_ptr = src_ptrs[i]; - for (size_t j = 0; j < data_offsets[i].size() - 1; j++) { + for (size_t j = 0; j < indexes.size(); j++) { i_ptr += data_offsets[i][j] * indexes[j]; } call_args.src_ptrs[i] = i_ptr; } for (size_t i = 0; i < dst_ptrs.size(); i++) { auto i_ptr = dst_ptrs[i]; - for (size_t j = 0; j < data_offsets[i + src_ptrs.size()].size() - 1; j++) { + for (size_t j = 0; j < indexes.size(); j++) { i_ptr += data_offsets[i + src_ptrs.size()][j] * indexes[j]; } call_args.dst_ptrs[i] = i_ptr; @@ -948,7 +948,7 @@ void Subgraph::SubgraphExecutor::segfault_detector() { #endif void Subgraph::SubgraphExecutor::parallel_for6d(const std::function& initializer, - const std::function& caller) { + const std::function&)>& caller) { const auto& dom = m_parallel_exec_domain; #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -962,7 +962,7 @@ void Subgraph::SubgraphExecutor::parallel_for6d(const std::function indexes{0, 0, 0, 0, 0}; parallel_it_init(start, indexes[0], dom[0], indexes[1], dom[1], indexes[2], dom[2], indexes[3], dom[3], indexes[4], dom[4]); for (size_t iwork = start; iwork < end; ++iwork) { caller(call_args, indexes); @@ -972,7 +972,7 @@ void Subgraph::SubgraphExecutor::parallel_for6d(const std::function& initializer, - const std::function& caller) { + const std::function&)>& caller) { const auto& dom = m_parallel_exec_domain; #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -994,7 +994,7 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function& inMemPtrs, const std::vector& outMemPtrs) = 0; void parallel_for6d(const std::function& initializer, - const std::function& caller); + const std::function&)>& caller); void parallel_forNd(const std::function& initializer, - const std::function& caller); + const std::function&)>& caller); inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { if (m_buffer_scratchpad_size > 0) From 6a7ad3280bb538ae7b389e94edecec915b58bd99 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 7 Nov 2024 18:30:24 +0100 Subject: [PATCH 05/42] Fix original memptrs corruption in dynamic scenario --- .../snippets/cpu_runtime_configurator.cpp | 5 ++- .../snippets/cpu_runtime_configurator.hpp | 2 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 31 ++++++++++--------- src/plugins/intel_cpu/src/nodes/subgraph.h | 5 +-- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index f38e68e3a3746b..0f4a35d9a1786f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -146,7 +146,6 @@ void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const { const auto& cpu_config = ov::as_type_ptr(m_config); auto& optimal_descs = cpu_config->m_in_requested_descs; - optimal_descs.resize(m_in_num); const auto& params = linear_ir->get_parameters(); OPENVINO_ASSERT(params.size() == m_in_num); for (size_t i = 0; i < m_in_num; ++i) { @@ -186,8 +185,8 @@ void CPURuntimeConfigurator::adjust_offsets_from_descs(const ov::snippets::lower const auto& cpu_config = ov::as_type_ptr(m_config); auto& optimal_descs = 
cpu_config->m_in_requested_descs; for (size_t i = 0; i < m_in_num; ++i) { - const auto& optimal_desc = optimal_descs[i]; - if (optimal_desc) { + if (optimal_descs.count(i)) { + const auto& optimal_desc = optimal_descs[i]; // It is assumed that shape is planar const auto& parameter = linear_ir->get_parameters()[i]; const auto& original_shape = parameter->get_output_port_descriptor(0)->get_shape(); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 80d575c9f09f3a..dabdd9c3c31f9f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -24,7 +24,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { #endif std::vector loop_args = {}; - std::vector m_in_requested_descs = {}; + std::unordered_map m_in_requested_descs = {}; }; class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 02e126a27a9a60..bc774f924b08dc 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -918,10 +918,10 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr(m_nthreads) * m_buffer_scratchpad_size); - const auto& requested_descs = snippet_config->m_in_requested_descs; - m_requested_repackings.resize(requested_descs.size()); - for (size_t i = 0; i < requested_descs.size(); ++i) { - m_requested_repackings[i].requested_desc = requested_descs[i]; + // TODO: here we need to already create memory, preliminary provide to allocator the adjusted scracth size + for (const auto& desc : snippet_config->m_in_requested_descs) { + const auto& requested_desc = desc.second; + m_in_requested_repackings.emplace(desc.first, RequestedRepacking(requested_desc, nullptr)); } #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -1000,25 +1000,28 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function& inMemPtrs, std::vector& outMemPtrs) { - repack_inputs(strm, inMemPtrs); - exec_impl(inMemPtrs, outMemPtrs); + if (m_in_requested_repackings.empty()) + exec_impl(inMemPtrs, outMemPtrs); + else + reorder_execute(strm, inMemPtrs, outMemPtrs); } -void Subgraph::SubgraphExecutor::repack_inputs(dnnl::stream strm, std::vector& inMemPtrs) { - OPENVINO_ASSERT(inMemPtrs.size() == m_requested_repackings.size()); - for (size_t i = 0; i < m_requested_repackings.size(); ++i) { - const auto& requested_desc = m_requested_repackings[i].requested_desc; - auto& scratch_mem = m_requested_repackings[i].scratch_mem; +void Subgraph::SubgraphExecutor::reorder_execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs) { + for (auto& requested_repacking : m_in_requested_repackings) { + const auto& requested_desc = requested_repacking.second.requested_desc; + auto& scratch_mem = requested_repacking.second.scratch_mem; if (requested_desc) { if (!scratch_mem || !scratch_mem->getDesc().isCompatible(*requested_desc)) { + // TODO: move to prepareParams and investigate why the repacking is called on each iteration // scratch_mem = m_scratchpad->createScratchPadMem(requested_desc); scratch_mem = std::make_shared(strm.get_engine(), requested_desc); - std::cout << "scratch_mem is created for requested desc " << i << std::endl; + std::cout << "scratch_mem is created for 
requested desc " << requested_repacking.first << std::endl; } - scratch_mem->load(*inMemPtrs[i]); - inMemPtrs[i] = scratch_mem; + scratch_mem->load(*inMemPtrs[requested_repacking.first]); + inMemPtrs[requested_repacking.first] = scratch_mem; } } + exec_impl(inMemPtrs, outMemPtrs); } } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 2398c9083aa032..24a36db1b4fc59 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -169,13 +169,14 @@ class Subgraph::SubgraphExecutor { #endif private: - void repack_inputs(dnnl::stream strm, std::vector& inMemPtrs); + void reorder_execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs); struct RequestedRepacking { + RequestedRepacking(MemoryDescPtr desc, MemoryPtr memory) : requested_desc(desc), scratch_mem(memory) {} MemoryDescPtr requested_desc = {}; MemoryPtr scratch_mem = {}; }; - std::vector m_requested_repackings = {}; + std::unordered_map m_in_requested_repackings = {}; DnnlScratchPadPtr m_scratchpad = {}; }; From bdf1e9a8b0f5f8cbff37c44b5b77bab1db0eaa15 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 8 Nov 2024 12:37:09 +0100 Subject: [PATCH 06/42] Compilation fix --- src/common/snippets/include/snippets/utils/utils.hpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/common/snippets/include/snippets/utils/utils.hpp b/src/common/snippets/include/snippets/utils/utils.hpp index d885f057f8687f..ff4646f24d03b7 100644 --- a/src/common/snippets/include/snippets/utils/utils.hpp +++ b/src/common/snippets/include/snippets/utils/utils.hpp @@ -311,16 +311,6 @@ void visit_path(const lowered::ExpressionPtr& expr, std::function func, bool visit_parent_path); -/** - * @brief Checks if layout is planar - */ -inline bool is_planar_layout(const std::vector& layout) { - for (size_t i = 0; i < layout.size(); ++i) - if (layout[i] != i) - return false; - return true; -} - } // namespace utils } // namespace snippets } // namespace ov From a8c558ea2dc98ea4bc95987b646852b494171f32 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 8 Nov 2024 15:45:37 +0100 Subject: [PATCH 07/42] Propagate updated shapes from SplitM to desc adjuster --- .../include/snippets/runtime_configurator.hpp | 2 +- .../snippets/src/runtime_configurator.cpp | 21 ++++++----- .../snippets/cpu_runtime_configurator.cpp | 35 ++++++++++--------- .../snippets/cpu_runtime_configurator.hpp | 8 +++-- 4 files changed, 36 insertions(+), 30 deletions(-) diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 7c8d03ea0c20d7..55679bc0745530 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -171,7 +171,7 @@ class RuntimeConfigurator { * @brief Checks if the current master shape can be optimized, and if yes, updates all the necessary runtime information * @return status if the optimization is applied */ - bool optimize(); + bool optimize(std::vector& shapes, std::vector>& layots); private: /** diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index b0e10c567a0fc6..c894a7ebfc1817 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -70,17 +70,17 @@ void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { 
m_config->master_shape = linear_ir->get_master_shape(); update_loop_info(linear_ir); - if (!m_optimizer.optimize()) { - // If the optimization was not applied, offsets are updated using shapes from descriptors - auto shapes = extract_shapes(); - update_data_offsets(shapes, extract_layouts()); - m_latest_shapes = std::move(shapes); - } + auto shapes = extract_shapes(); + auto layouts = extract_layouts(); + m_optimizer.optimize(shapes, layouts); + + update_data_offsets(shapes, layouts); // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` // because `ComputeAllocationSize` depends on subtensors which are updated in the table get_kernel_executor_table()->update_state(linear_ir); update_buffer_scratchpad_size(linear_ir); + m_latest_shapes = std::move(shapes); } void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { @@ -382,7 +382,8 @@ bool RuntimeConfigurator::MHAParallelWAOptimizer::enabled() const { return !loops_to_split.empty(); } -bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize() { +bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize(std::vector& shapes, + std::vector>& layouts) { OPENVINO_ASSERT(configurator != nullptr, "Configurator is nullptr"); if (!enabled()) return false; @@ -390,7 +391,7 @@ bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize() { size_t new_batch_dim, new_kernel_dim; if (!SplitDimensionM::split(configurator->m_config->master_shape, concurrency, new_batch_dim, new_kernel_dim)) return false; - + std::cout << "[ INFO ] MHAParallelWAOptimizer works\n"; auto& master_shape = configurator->m_config->master_shape; *++master_shape.rbegin() = new_kernel_dim; master_shape.insert(master_shape.cbegin() + master_shape.size() - 2, new_batch_dim); @@ -416,14 +417,12 @@ bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize() { loop->apply(updater, updated_loops); } - auto shapes = configurator->extract_shapes(); for (size_t i = 0; i < configurator->m_io_num; ++i) { shapes[i] = unsqueezed_params.count(i) ? SplitDimensionM::unsqueeze_m_dim(shapes[i], m_dim_idces[i]) : SplitDimensionM::reshape_m_dim(shapes[i], m_dim_idces[i], new_batch_dim, new_kernel_dim); } - configurator->update_data_offsets(shapes, optimized_layouts); - configurator->m_latest_shapes = std::move(shapes); + layouts = optimized_layouts; return true; } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 0f4a35d9a1786f..1e65f97eafadb6 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -52,18 +52,17 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI } void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { - update_requested_descs(linear_ir); m_config->master_shape = linear_ir->get_master_shape(); if (linear_ir->is_dynamic()) { update_loop_info(linear_ir); } - if (!m_optimizer.optimize()) { - // If the optimization was not applied, offsets are updated using shapes from descriptors - auto shapes = extract_shapes(); - update_data_offsets(shapes, extract_layouts()); - m_latest_shapes = std::move(shapes); - } + auto shapes = extract_shapes(); + auto layouts = extract_layouts(); + m_optimizer.optimize(shapes, layouts); + // Why must it be called before kernel executor table update? 
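+        // update_requested_descs() collects blocked (VNNI-repacked) memory descriptors for Parameter
+        // inputs whose BrgemmCopyB repacking was moved outside the Subgraph; these descriptors are
+        // later used by adjust_offsets_from_descs() and by the SubgraphExecutor input repacking.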
+ update_requested_descs(linear_ir, shapes, layouts); + if (linear_ir->is_dynamic()) loopPortsAdjuster.optimize(); @@ -75,7 +74,9 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l if (linear_ir->is_dynamic()) { update_loop_args(linear_ir); } - adjust_offsets_from_descs(linear_ir); + update_data_offsets(shapes, layouts); + adjust_offsets_from_descs(linear_ir, shapes, layouts); + m_latest_shapes = std::move(shapes); } void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { @@ -143,7 +144,9 @@ void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { } #endif -void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const { +void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const std::vector& shapes, + const std::vector>& layots) const { const auto& cpu_config = ov::as_type_ptr(m_config); auto& optimal_descs = cpu_config->m_in_requested_descs; const auto& params = linear_ir->get_parameters(); @@ -157,8 +160,8 @@ void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered: return port.get_index() == 1 && brgemm && brgemm_utils::with_repacking(brgemm->get_type()); }); if (brgemm_with_extracted_repacking) { - const auto& desc = param->get_output_port_descriptor(0); - const auto& shape = desc->get_shape(); + const auto& shape = shapes[i]; + // TODO: support orbitrary order const auto& K = *++shape.rbegin(); const auto& N = *shape.rbegin(); @@ -181,22 +184,22 @@ void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered: } } } -void CPURuntimeConfigurator::adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const { +void CPURuntimeConfigurator::adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const std::vector& shapes, + const std::vector>& layouts) const { const auto& cpu_config = ov::as_type_ptr(m_config); auto& optimal_descs = cpu_config->m_in_requested_descs; for (size_t i = 0; i < m_in_num; ++i) { if (optimal_descs.count(i)) { const auto& optimal_desc = optimal_descs[i]; - // It is assumed that shape is planar - const auto& parameter = linear_ir->get_parameters()[i]; - const auto& original_shape = parameter->get_output_port_descriptor(0)->get_shape(); + const auto& original_shape = shapes[i]; const auto& blocked_shape = optimal_desc->as()->getBlockDims(); ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1); shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin(), blocked_shape.end()); auto& offsets = m_config->io_data_offsets[i]; compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_io_data_sizes[i], 0); - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(parameter->get_output_port_descriptor(0)->get_layout())); + OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[i])); } } } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index dabdd9c3c31f9f..4ca122796f6d04 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -54,8 +54,12 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; - void 
update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; - void adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + void update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const std::vector& shapes, + const std::vector>& layouts) const; + void adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const std::vector& shapes, + const std::vector>& layouts) const; static const size_t rank6D; From 00dd68fc53a3e7c5ff75cdfa6a458e907ad53b18 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 8 Nov 2024 15:45:56 +0100 Subject: [PATCH 08/42] Move brgemm repacking out fix --- .../x64/pass/move_brgemm_repacking_out.cpp | 32 ++++++------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp index eb68356c7c6094..6853e9c6ba7928 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp @@ -1,29 +1,17 @@ -// Copyright (C) 2018-2022 Intel Corporation +// Copyright (C) 2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/itt.hpp" - #include "move_brgemm_repacking_out.hpp" -#include "snippets/utils/utils.hpp" -#include "snippets/op/brgemm.hpp" -#include "snippets/op/buffer.hpp" -#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" -#include "transformations/snippets/x64/op/brgemm_cpu.hpp" -#include "transformations/tpp/x64/op/modifiers.hpp" - -#include "openvino/core/rt_info.hpp" -#include "openvino/pass/pattern/op/wrap_type.hpp" -#include "openvino/pass/pattern/matcher.hpp" - #include "cpu/x64/cpu_isa_traits.hpp" +#include "openvino/pass/pattern/matcher.hpp" +#include "openvino/pass/pattern/op/wrap_type.hpp" +#include "snippets/itt.hpp" +#include "snippets/op/rank_normalization.hpp" +#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" -#include "cpu_shape.h" -#include "utils/general_utils.h" - - namespace ov { namespace intel_cpu { @@ -38,7 +26,6 @@ pass::MoveBrgemmRepackingOut::MoveBrgemmRepackingOut() { auto callback = [=](ov::pass::pattern::Matcher& m) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::MoveBrgemmRepackingOut") const auto& pattern_map = m.get_pattern_value_map(); - const auto& copy_b_in = pattern_map.at(m_param); const auto& copy_b_out = pattern_map.at(m_copy_b); const auto copy_b_node = ov::as_type_ptr(copy_b_out.get_node_shared_ptr()); OPENVINO_ASSERT(copy_b_node, "BrgemmCopyB node is null in MoveBrgemmRepackingOut transformation"); @@ -48,11 +35,12 @@ pass::MoveBrgemmRepackingOut::MoveBrgemmRepackingOut() { // TODO: // 1. handle copyB with compensations // 2. 
handle non-planar layout - if (!ov::snippets::utils::is_planar_layout(layout) || copy_b_node->get_src_element_type() == ov::element::i8 || - transformation_callback(copy_b_node)) + if (!ov::snippets::utils::is_planar_layout(layout) || + copy_b_node->get_src_element_type() == ov::element::i8 || transformation_callback(copy_b_node)) return false; + std::cout << "copy_b_node = " << copy_b_node << std::endl; std::cout << "[ INFO ] MoveBrgemmRepackingOut is finished\n"; - return ov::replace_output_update_name(copy_b_out, copy_b_in); + return ov::replace_output_update_name(copy_b_out, copy_b_node->input_value(0)); }; auto m = std::make_shared(m_copy_b, matcher_name); From fa14643d084beb76016f5ad4a12224c5caf8c2d1 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Sun, 10 Nov 2024 22:35:30 +0100 Subject: [PATCH 09/42] Codestyle --- .../test_utils/common_test_utils/src/ov_tensor_utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp b/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp index 7cbf02eb4bf143..133959fd9fdc6b 100644 --- a/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp +++ b/src/tests/test_utils/common_test_utils/src/ov_tensor_utils.cpp @@ -444,9 +444,9 @@ class Error { << " Diff: " << std::fabs(val.expected_value - val.actual_value) << " calculated_abs_threshold: " << val.threshold << " abs_threshold: " << abs_threshold << " rel_threshold: " << rel_threshold << "\n"; -// #ifdef NDEBUG +#ifdef NDEBUG break; -// #endif +#endif } throw std::runtime_error(msg); } else if (!less_or_equal(mvn_results, mvn_threshold)) { From 013cc2fb1028414e60afdf4930d739b07fdbad03 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 11 Nov 2024 11:39:51 +0100 Subject: [PATCH 10/42] Scratchpad reused for intermediate repackings --- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 50 ++++++++++---------- src/plugins/intel_cpu/src/nodes/subgraph.h | 10 +--- 2 files changed, 28 insertions(+), 32 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index bc774f924b08dc..34d3ad1dd766a6 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -82,8 +82,8 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_out, const std::shared_ptr& snippet_config, const BufferScratchpadAllocator& allocator, - const DnnlScratchPadPtr& scratchpad) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator, scratchpad) {} + const dnnl::engine& engine) + : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator, engine) {} void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { const auto& callable = m_schedule->get_callable(); @@ -126,8 +126,8 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_out, const std::shared_ptr& snippet_config, const BufferScratchpadAllocator& allocator, - const DnnlScratchPadPtr& scratchpad) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator, scratchpad) { + const dnnl::engine& engine) + : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator, engine) { buffer_offsets = snippet_config->buffer_cluster_offsets; data_offsets = 
snippet_config->io_data_offsets; loop_args = snippet_config->loop_args; @@ -798,7 +798,7 @@ void Subgraph::prepareParams() { start_offset_out, snippet_config, allocator, - context->getScratchPad()); + getEngine()); } else { // Static case: // 1. Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be compiled in JIT code @@ -815,7 +815,7 @@ void Subgraph::prepareParams() { start_offset_out, snippet_config, allocator, - context->getScratchPad()); + getEngine()); } }; @@ -905,8 +905,8 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& start_offset_out, const std::shared_ptr& snippet_config, const BufferScratchpadAllocator& allocator, - const DnnlScratchPadPtr& scratchpad) - : m_schedule(snippet->get()), m_start_offset_in(start_offset_in), m_start_offset_out(start_offset_out), m_scratchpad(scratchpad) { + const dnnl::engine& engine) + : m_schedule(snippet->get()), m_start_offset_in(start_offset_in), m_start_offset_out(start_offset_out) { OPENVINO_ASSERT(m_schedule, "Schedule is empty!"); OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!"); init_parallel_domain(snippet_config, m_parallel_exec_domain); @@ -916,12 +916,23 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptrbuffer_scratchpad_size; OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!"); - m_buffer_scratchpad = allocator(static_cast(m_nthreads) * m_buffer_scratchpad_size); - - // TODO: here we need to already create memory, preliminary provide to allocator the adjusted scracth size + const auto internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; + const auto external_repacking_buffer_size = + std::accumulate(snippet_config->m_in_requested_descs.begin(), + snippet_config->m_in_requested_descs.end(), + size_t(0), + [](size_t sum, const std::pair& requested_desc_elem) { + return sum + requested_desc_elem.second->getCurrentMemSize(); + }); + m_buffer_scratchpad = allocator(internal_buffer_size + external_repacking_buffer_size); + + size_t offset = internal_buffer_size; for (const auto& desc : snippet_config->m_in_requested_descs) { const auto& requested_desc = desc.second; - m_in_requested_repackings.emplace(desc.first, RequestedRepacking(requested_desc, nullptr)); + const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; + m_in_requested_repackings[desc.first] = std::make_shared(engine, requested_desc, data_ptr); + offset += requested_desc->getCurrentMemSize(); + std::cout << "scratch_mem is created for requested desc " << desc.first << std::endl; } #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -1008,18 +1019,9 @@ void Subgraph::SubgraphExecutor::execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs) { for (auto& requested_repacking : m_in_requested_repackings) { - const auto& requested_desc = requested_repacking.second.requested_desc; - auto& scratch_mem = requested_repacking.second.scratch_mem; - if (requested_desc) { - if (!scratch_mem || !scratch_mem->getDesc().isCompatible(*requested_desc)) { - // TODO: move to prepareParams and investigate why the repacking is called on each iteration - // scratch_mem = m_scratchpad->createScratchPadMem(requested_desc); - scratch_mem = std::make_shared(strm.get_engine(), requested_desc); - std::cout << "scratch_mem is created for requested desc " << requested_repacking.first << std::endl; - } - 
scratch_mem->load(*inMemPtrs[requested_repacking.first]); - inMemPtrs[requested_repacking.first] = scratch_mem; - } + const auto& scratch_mem = requested_repacking.second; + scratch_mem->load(*inMemPtrs[requested_repacking.first]); + inMemPtrs[requested_repacking.first] = scratch_mem; } exec_impl(inMemPtrs, outMemPtrs); } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 24a36db1b4fc59..42ccec8d25c643 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -127,7 +127,7 @@ class Subgraph::SubgraphExecutor { const std::vector& start_offset_out, const std::shared_ptr& snippet_config, const BufferScratchpadAllocator& allocator, - const DnnlScratchPadPtr& scratchpad); + const dnnl::engine& engine); virtual ~SubgraphExecutor() = default; void execute(dnnl::stream strm, std::vector& inMemPtrs, std::vector& outMemPtrs); @@ -171,13 +171,7 @@ class Subgraph::SubgraphExecutor { private: void reorder_execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs); - struct RequestedRepacking { - RequestedRepacking(MemoryDescPtr desc, MemoryPtr memory) : requested_desc(desc), scratch_mem(memory) {} - MemoryDescPtr requested_desc = {}; - MemoryPtr scratch_mem = {}; - }; - std::unordered_map m_in_requested_repackings = {}; - DnnlScratchPadPtr m_scratchpad = {}; + std::unordered_map m_in_requested_repackings = {}; }; } // namespace node From 43c939dffc6e0d41e9d989470d7df131239e0e28 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 11 Nov 2024 15:18:12 +0100 Subject: [PATCH 11/42] Recreate memory object for external repacking on each inference --- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 34d3ad1dd766a6..4d0f057b8f643b 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -932,7 +932,8 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptrgetDataAs() + offset; m_in_requested_repackings[desc.first] = std::make_shared(engine, requested_desc, data_ptr); offset += requested_desc->getCurrentMemSize(); - std::cout << "scratch_mem is created for requested desc " << desc.first << std::endl; + std::cout << "scratch_mem is created for requested desc " << desc.first + << ", ptr = " << m_in_requested_repackings[desc.first]->getData() << std::endl; } #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -1018,6 +1019,22 @@ void Subgraph::SubgraphExecutor::execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs) { + std::cout << "[ INFO ] Reorder execute is called\n"; + // TODO: discuss whether it is applicable to create new memory object from scratchpad on each inference + // As an alternative option, the separate memory object (not from scratchpad) can be created once on Executor constructor stage + const auto internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; + size_t offset = internal_buffer_size; + for (auto& intermediate_memory : m_in_requested_repackings) { + auto& mem = intermediate_memory.second; + const auto& desc = mem->getDescPtr(); + const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; + mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); + offset += desc->getCurrentMemSize(); + std::cout << "scratch_mem is 
used for requested desc " << intermediate_memory.first + << ", ptr = " << mem->getData() << std::endl; + std::cout << "m_scratch = " << m_buffer_scratchpad->getData() << std::endl; + } + for (auto& requested_repacking : m_in_requested_repackings) { const auto& scratch_mem = requested_repacking.second; scratch_mem->load(*inMemPtrs[requested_repacking.first]); From 89be0f310d7a39d102b4ca0730e22eb5e8ffd1a8 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 11 Nov 2024 15:48:34 +0100 Subject: [PATCH 12/42] Cleanup --- src/common/snippets/src/runtime_configurator.cpp | 2 +- .../src/emitters/snippets/cpu_runtime_configurator.cpp | 10 ++++------ .../src/emitters/snippets/cpu_runtime_configurator.hpp | 7 +++---- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index c894a7ebfc1817..08a5f461a93f69 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -261,7 +261,7 @@ void RuntimeConfigurator::update_data_offsets(const std::vector& sha const std::vector>& layouts) const { OPENVINO_ASSERT(shapes.size() == m_io_num, "Number of custom shapes must be 0 or be equal to m_io_num"); OPENVINO_ASSERT(layouts.size() == m_io_num, "Number of custom layouts must be 0 or be equal to m_io_num"); - for (size_t i = 0; i < m_io_num; ++i) { + for (size_t i = 0; i < m_io_num; ++i) { // offsets represent distance between consecutive elements of corresponding dimension. // If a dim size == 1, then the next dim starts immediately and the stride is 0 // case 1: diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 1e65f97eafadb6..56809105c32995 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -75,7 +75,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l update_loop_args(linear_ir); } update_data_offsets(shapes, layouts); - adjust_offsets_from_descs(linear_ir, shapes, layouts); + adjust_offsets_from_descs(shapes, layouts); m_latest_shapes = std::move(shapes); } @@ -179,13 +179,11 @@ void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered: const auto last_idx = shape.size() - 1; requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - auto cpu_desc = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); - optimal_descs[i] = MemoryDescUtils::convertToDnnlMemoryDesc(cpu_desc); + optimal_descs[i] = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); } } } -void CPURuntimeConfigurator::adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::vector& shapes, +void CPURuntimeConfigurator::adjust_offsets_from_descs(const std::vector& shapes, const std::vector>& layouts) const { const auto& cpu_config = ov::as_type_ptr(m_config); auto& optimal_descs = cpu_config->m_in_requested_descs; @@ -193,7 +191,7 @@ void CPURuntimeConfigurator::adjust_offsets_from_descs(const ov::snippets::lower if (optimal_descs.count(i)) { const auto& optimal_desc = optimal_descs[i]; const auto& original_shape = shapes[i]; - const auto& blocked_shape = optimal_desc->as()->getBlockDims(); + const auto& blocked_shape = optimal_desc->getBlockDims(); 
ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1); shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin(), blocked_shape.end()); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 4ca122796f6d04..c1e3ae499b03a8 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -9,7 +9,7 @@ #include "snippets/lowered/port_descriptor.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" -#include "memory_desc/cpu_memory_desc.h" +#include "memory_desc/cpu_blocked_memory_desc.h" namespace ov { namespace intel_cpu { @@ -24,7 +24,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { #endif std::vector loop_args = {}; - std::unordered_map m_in_requested_descs = {}; + std::unordered_map m_in_requested_descs = {}; }; class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { @@ -57,8 +57,7 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { void update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, const std::vector& shapes, const std::vector>& layouts) const; - void adjust_offsets_from_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::vector& shapes, + void adjust_offsets_from_descs(const std::vector& shapes, const std::vector>& layouts) const; static const size_t rank6D; From f36b32389c79013f33f7a6e16ed721895970576a Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 11 Nov 2024 16:06:53 +0100 Subject: [PATCH 13/42] Store descs in SubgraphExecutor --- .../snippets/cpu_runtime_configurator.cpp | 25 +++++----- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 47 +++++++------------ src/plugins/intel_cpu/src/nodes/subgraph.h | 2 +- .../snippets/x64/op/brgemm_cpu.cpp | 7 ++- .../adjust_brgemm_copy_b_loop_ports.cpp | 1 + 5 files changed, 33 insertions(+), 49 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 56809105c32995..89098ab9274545 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -186,19 +186,18 @@ void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered: void CPURuntimeConfigurator::adjust_offsets_from_descs(const std::vector& shapes, const std::vector>& layouts) const { const auto& cpu_config = ov::as_type_ptr(m_config); - auto& optimal_descs = cpu_config->m_in_requested_descs; - for (size_t i = 0; i < m_in_num; ++i) { - if (optimal_descs.count(i)) { - const auto& optimal_desc = optimal_descs[i]; - const auto& original_shape = shapes[i]; - const auto& blocked_shape = optimal_desc->getBlockDims(); - - ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1); - shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin(), blocked_shape.end()); - auto& offsets = m_config->io_data_offsets[i]; - compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_io_data_sizes[i], 0); - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[i])); - } + for (const auto& map_elem : cpu_config->m_in_requested_descs) { + const auto input_idx = map_elem.first; + const auto& optimal_desc = map_elem.second; + 
const auto& original_shape = shapes[input_idx]; + const auto& blocked_shape = optimal_desc->getBlockDims(); + + ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1); + shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin(), blocked_shape.end()); + auto& offsets = m_config->io_data_offsets[input_idx]; + compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_io_data_sizes[input_idx], 0); + // TODO: Support non-planar layout + OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[input_idx])); } } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 4d0f057b8f643b..680adb0263a7e2 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -3,8 +3,6 @@ // #include "subgraph.h" -#include "nodes/reorder.h" -#include "nodes/common/reorder_prim.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "memory_desc/cpu_memory_desc_utils.h" #include "common/primitive_hashing_utils.hpp" @@ -655,9 +653,8 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() { } SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::Before, ov::snippets::pass::PropagatePrecision, ov::intel_cpu::pass::BrgemmToBrgemmCPU); - if (!std::getenv("REFERENCE")) - SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::After, ov::intel_cpu::pass::BrgemmToBrgemmCPU, - ov::intel_cpu::pass::MoveBrgemmRepackingOut); + SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::After, ov::intel_cpu::pass::BrgemmToBrgemmCPU, + ov::intel_cpu::pass::MoveBrgemmRepackingOut); SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64(Place::PipelineEnd, ov::intel_cpu::pass::RemoveConverts); SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(Place::PipelineEnd, ov::intel_cpu::pass::MulAddToFMA); @@ -917,25 +914,16 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptrbuffer_scratchpad_size; OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!"); const auto internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; + m_in_requested_descs = snippet_config->m_in_requested_descs; const auto external_repacking_buffer_size = - std::accumulate(snippet_config->m_in_requested_descs.begin(), - snippet_config->m_in_requested_descs.end(), + std::accumulate(m_in_requested_descs.begin(), + m_in_requested_descs.end(), size_t(0), [](size_t sum, const std::pair& requested_desc_elem) { return sum + requested_desc_elem.second->getCurrentMemSize(); }); m_buffer_scratchpad = allocator(internal_buffer_size + external_repacking_buffer_size); - size_t offset = internal_buffer_size; - for (const auto& desc : snippet_config->m_in_requested_descs) { - const auto& requested_desc = desc.second; - const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; - m_in_requested_repackings[desc.first] = std::make_shared(engine, requested_desc, data_ptr); - offset += requested_desc->getCurrentMemSize(); - std::cout << "scratch_mem is created for requested desc " << desc.first - << ", ptr = " << m_in_requested_repackings[desc.first]->getData() << std::endl; - } - #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) const auto target = std::dynamic_pointer_cast(snippet_attrs->snippet->get_generator()->get_target_machine()); enabled_segfault_detector = target && target->debug_config.enable_segfault_detector; @@ -1012,7 +1000,7 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function& inMemPtrs, 
std::vector& outMemPtrs) { - if (m_in_requested_repackings.empty()) + if (m_in_requested_descs.empty()) exec_impl(inMemPtrs, outMemPtrs); else reorder_execute(strm, inMemPtrs, outMemPtrs); @@ -1024,22 +1012,19 @@ void Subgraph::SubgraphExecutor::reorder_execute(dnnl::stream strm, std::vector< // As an alternative option, the separate memory object (not from scratchpad) can be created once on Executor constructor stage const auto internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; size_t offset = internal_buffer_size; - for (auto& intermediate_memory : m_in_requested_repackings) { - auto& mem = intermediate_memory.second; - const auto& desc = mem->getDescPtr(); + for (const auto& requested_descs_elem : m_in_requested_descs) { + const auto in_idx = requested_descs_elem.first; + const auto& requested_desc = requested_descs_elem.second; + const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; - mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); - offset += desc->getCurrentMemSize(); - std::cout << "scratch_mem is used for requested desc " << intermediate_memory.first - << ", ptr = " << mem->getData() << std::endl; + const auto scratch_mem = std::make_shared(strm.get_engine(), requested_desc, data_ptr, false); + scratch_mem->load(*inMemPtrs[in_idx]); + inMemPtrs[in_idx] = scratch_mem; + offset += requested_desc->getCurrentMemSize(); + std::cout << "scratch_mem is used for requested desc " << in_idx + << ", ptr = " << scratch_mem->getData() << std::endl; std::cout << "m_scratch = " << m_buffer_scratchpad->getData() << std::endl; } - - for (auto& requested_repacking : m_in_requested_repackings) { - const auto& scratch_mem = requested_repacking.second; - scratch_mem->load(*inMemPtrs[requested_repacking.first]); - inMemPtrs[requested_repacking.first] = scratch_mem; - } exec_impl(inMemPtrs, outMemPtrs); } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 42ccec8d25c643..5b0eed96080023 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -171,7 +171,7 @@ class Subgraph::SubgraphExecutor { private: void reorder_execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs); - std::unordered_map m_in_requested_repackings = {}; + std::unordered_map m_in_requested_descs = {}; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp index 0dac2c5dc4d809..1c3e90bbccf34f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_cpu.cpp @@ -68,10 +68,9 @@ void BrgemmCPU::custom_constructor_validate_and_infer_types(std::vector INTERNAL_OP_SCOPE(BrgemmCPU_constructor_validate_and_infer_types); validate_inputs(); - // This shape inference can use get_input_partial_shape(1) in all cases - const auto planar_input_shapes = - std::vector{ snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout_a), - snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b) }; + const std::vector planar_input_shapes{ + snippets::utils::get_planar_pshape(get_input_partial_shape(0), layout_a), + snippets::utils::get_planar_pshape(get_input_partial_shape(1), layout_b)}; auto output_shape = infer_output_partial_shape(planar_input_shapes); set_output_type(0, get_output_type(), 
snippets::utils::get_planar_pshape(output_shape, layout_c)); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index c421e5cc2a4805..f3437dd8019332 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -67,6 +67,7 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li for (const auto& expr : linear_ir) { const auto& node = expr->get_node(); + // TODO: start this logic from BrgemmCPU, not from BrgemmCopyB if (!is_type(node)) continue; const auto& repacking_loop_ids = expr->get_loop_ids(); From a8638177f579dfc85c1723d8c8ab1aadc783d3ed Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 11 Nov 2024 16:25:27 +0100 Subject: [PATCH 14/42] get_copy_b_expr helper --- .../snippets/x64/op/brgemm_utils.cpp | 16 ++++++++++++++++ .../snippets/x64/op/brgemm_utils.hpp | 6 ++++++ .../x64/pass/lowered/brgemm_cpu_blocking.cpp | 10 +--------- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index adc215ef1d9900..42d9449b49ce8c 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -6,7 +6,9 @@ #include "dnnl_extension_utils.h" #include "emitters/utils.hpp" +#include "snippets/op/buffer.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "utils/general_utils.h" using namespace Xbyak; @@ -83,6 +85,20 @@ size_t compute_inner_n_block(const ov::element::Type& precision) { default: OPENVINO_THROW("BrgemmCopyB doesn't support precision ", precision); } } + +const ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) { + OPENVINO_ASSERT(ov::is_type(brgemm_expr->get_node()), "get_copy_b_expr must be called only for BrgemmCPU node"); + const auto b_input_expr = brgemm_expr->get_input_port_connector(1)->get_source().get_expr(); + if (ov::is_type(b_input_expr->get_node())) { + return b_input_expr; + } else if (ov::is_type(b_input_expr->get_node())) { + const auto input_buffer_expr = b_input_expr->get_input_port_connector(0)->get_source().get_expr(); + if (ov::is_type(b_input_expr->get_node())) { + return input_buffer_expr; + } + } + return nullptr; +} } // namespace repacking } // namespace brgemm_utils } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index eccb8cfdb7c479..a56d4d23672001 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -56,6 +56,12 @@ T compute_LDB(T n_block, const ov::element::Type& precision) { n_block : std::max(n_block, static_cast(compute_inner_n_block(precision))); } +/** + * @brief Retrieves the expression pointer for the brgemm_copy_b emitter corresponding to the given BrgemmCPU expression. 
+ * @param brgemm_expr The expression pointer for the BrgemmCPU operation. + * @return The expression pointer for the BrgemmCopyB operation. + */ +const snippets::lowered::ExpressionPtr get_copy_b_expr(const snippets::lowered::ExpressionPtr& brgemm_expr); } // namespace repacking } // namespace brgemm_utils } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp index f9bab5ca5b96d7..692cc6e99a9de3 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.cpp @@ -83,15 +83,7 @@ bool BrgemmCPUBlocking::mark_blocking_loops(LinearIR& linear_ir, if (stand_alone(type)) return res; - ExpressionPtr copy_b_expr; - const auto b_input_expr = brgemm_expr->get_input_port_connector(1)->get_source().get_expr(); - if (ov::is_type(b_input_expr->get_node())) { - copy_b_expr = b_input_expr; - } else if (ov::is_type(b_input_expr->get_node())) { - const auto input_buffer_expr = b_input_expr->get_input_port_connector(0)->get_source().get_expr(); - if (ov::is_type(b_input_expr->get_node())) - copy_b_expr = input_buffer_expr; - } + const auto copy_b_expr = repacking::get_copy_b_expr(brgemm_expr); if (copy_b_expr) { copy_b_expr->get_input_port_descriptor(0)->set_subtensor({get_full_dim_value(), get_full_dim_value()}); copy_b_expr->get_output_port_descriptor(0)->set_subtensor({get_full_dim_value(), get_full_dim_value()}); From 1d1d605b8eda911b5dc8e278c71c6c5706155d38 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 12 Nov 2024 11:20:58 +0100 Subject: [PATCH 15/42] Match AdjustBrgemmCopyBLoopPorts on BrgemmCPU instead of repacking --- .../adjust_brgemm_copy_b_loop_ports.cpp | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index f3437dd8019332..8d734e288514bf 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -65,18 +65,27 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li bool modified = false; + auto get_repacking_loop_idces = [](const snippets::lowered::ExpressionPtr& parent_expr) { + // Repacking may be extracted outside the snippets kernel. In this case, brgemm parent expression is a parameter. 
+ if (is_type(parent_expr->get_node())) + return std::vector{}; + + OPENVINO_ASSERT(is_type(parent_expr), + "In case of repacking brgemm expr must have BufferExpression on B input"); + const auto buffer_parent_ports = parent_expr->get_input_port(0).get_connected_ports(); + OPENVINO_ASSERT(buffer_parent_ports.size() == 1, + "Parent of brgemm repacking buffer must be connected only to the buffer"); + const auto& repacking_expr = buffer_parent_ports.begin()->get_expr(); + return repacking_expr->get_loop_ids(); + }; + for (const auto& expr : linear_ir) { - const auto& node = expr->get_node(); - // TODO: start this logic from BrgemmCPU, not from BrgemmCopyB - if (!is_type(node)) + const auto brgemm = ov::as_type_ptr(expr->get_node()); + if (!brgemm || !brgemm_utils::with_repacking(brgemm->get_type())) continue; - const auto& repacking_loop_ids = expr->get_loop_ids(); - const auto& child_ports = expr->get_output_port(0).get_connected_ports(); - OPENVINO_ASSERT(child_ports.size() == 1 && - is_type(child_ports.begin()->get_expr()), - "BrgemmCopyB should have one BufferExpression child"); - auto grandchild_ports = child_ports.begin()->get_expr()->get_output_port(0).get_connected_ports(); - for (const auto& target_port : grandchild_ports) { + const auto& parent_expr = expr->get_input_port_connector(1)->get_source().get_expr(); + const auto& repacking_loop_ids = get_repacking_loop_idces(parent_expr); + for (const auto& target_port : parent_expr->get_output_port(0).get_connected_ports()) { const auto& port_node = target_port.get_expr()->get_node(); if (!is_type(port_node)) { OPENVINO_ASSERT(is_type(port_node), From a7cb0fa7be97111639b39d261b22c4d44a8d215d Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 12 Nov 2024 15:59:50 +0100 Subject: [PATCH 16/42] Cleanup --- src/common/snippets/src/runtime_configurator.cpp | 1 - src/plugins/intel_cpu/src/nodes/subgraph.cpp | 6 ------ 2 files changed, 7 deletions(-) diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 08a5f461a93f69..54581e078aaa1e 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -391,7 +391,6 @@ bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize(std::vectorm_config->master_shape, concurrency, new_batch_dim, new_kernel_dim)) return false; - std::cout << "[ INFO ] MHAParallelWAOptimizer works\n"; auto& master_shape = configurator->m_config->master_shape; *++master_shape.rbegin() = new_kernel_dim; master_shape.insert(master_shape.cbegin() + master_shape.size() - 2, new_batch_dim); diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 680adb0263a7e2..d46a6c9c105228 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -1007,9 +1007,6 @@ void Subgraph::SubgraphExecutor::execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs) { - std::cout << "[ INFO ] Reorder execute is called\n"; - // TODO: discuss whether it is applicable to create new memory object from scratchpad on each inference - // As an alternative option, the separate memory object (not from scratchpad) can be created once on Executor constructor stage const auto internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; size_t offset = internal_buffer_size; for (const auto& requested_descs_elem : m_in_requested_descs) { @@ -1021,9 +1018,6 @@ void 
Subgraph::SubgraphExecutor::reorder_execute(dnnl::stream strm, std::vector< scratch_mem->load(*inMemPtrs[in_idx]); inMemPtrs[in_idx] = scratch_mem; offset += requested_desc->getCurrentMemSize(); - std::cout << "scratch_mem is used for requested desc " << in_idx - << ", ptr = " << scratch_mem->getData() << std::endl; - std::cout << "m_scratch = " << m_buffer_scratchpad->getData() << std::endl; } exec_impl(inMemPtrs, outMemPtrs); } From db09212faa5d71dd22c5e4d1de454f1de0537467 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 12 Nov 2024 17:36:33 +0100 Subject: [PATCH 17/42] Introduced BrgemmExternalRepackingAdjuster --- .../snippets/cpu_runtime_configurator.cpp | 108 ++++++++++-------- .../snippets/cpu_runtime_configurator.hpp | 32 +++--- 2 files changed, 72 insertions(+), 68 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 89098ab9274545..17662960793c95 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -49,6 +49,7 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI if (linear_ir->is_dynamic()) { loopPortsAdjuster = BrgemmCopyBLoopPortsAdjuster(linear_ir); } + externalRepackingAdjuster = BrgemmExternalRepackingAdjuster(linear_ir, this); } void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { @@ -60,8 +61,6 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l auto shapes = extract_shapes(); auto layouts = extract_layouts(); m_optimizer.optimize(shapes, layouts); - // Why must it be called before kernel executor table update? 
- update_requested_descs(linear_ir, shapes, layouts); if (linear_ir->is_dynamic()) loopPortsAdjuster.optimize(); @@ -75,7 +74,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l update_loop_args(linear_ir); } update_data_offsets(shapes, layouts); - adjust_offsets_from_descs(shapes, layouts); + externalRepackingAdjuster.optimize(linear_ir, shapes, layouts); m_latest_shapes = std::move(shapes); } @@ -110,11 +109,14 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::Linea } } #ifdef OPENVINO_ARCH_ARM64 -CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { -} - -void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { -} +CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster( + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + CPURuntimeConfigurator* configurator) {} + +void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize( + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const std::vector& shapes, + const std::vector>& layouts) {} #else CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { const auto& pass = std::make_shared(); @@ -144,62 +146,68 @@ void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { } #endif -void CPURuntimeConfigurator::update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::vector& shapes, - const std::vector>& layots) const { - const auto& cpu_config = ov::as_type_ptr(m_config); - auto& optimal_descs = cpu_config->m_in_requested_descs; +#ifdef OPENVINO_ARCH_ARM64 +CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { +} + +void CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::optimize() { +} +#else +CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + CPURuntimeConfigurator* configurator) : m_configurator(configurator) { const auto& params = linear_ir->get_parameters(); - OPENVINO_ASSERT(params.size() == m_in_num); - for (size_t i = 0; i < m_in_num; ++i) { + for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; - auto consumers = param->get_output_port_connector(0)->get_consumers(); + const auto consumers = param->get_output_port_connector(0)->get_consumers(); const bool brgemm_with_extracted_repacking = std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); return port.get_index() == 1 && brgemm && brgemm_utils::with_repacking(brgemm->get_type()); }); if (brgemm_with_extracted_repacking) { - const auto& shape = shapes[i]; - // TODO: support orbitrary order - const auto& K = *++shape.rbegin(); - const auto& N = *shape.rbegin(); - - const auto& precision = param->get_node()->get_output_element_type(0); - const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); - // Firstly, batch dims are set - VectorDims requested_blocked_shape(shape.begin(), shape.end() - m_config->tile_rank); - // Then, the blocked dims are formed - requested_blocked_shape.insert( - requested_blocked_shape.end(), - {snippets::utils::div_up(K, vnni_factor), std::max(N, 
brgemm_utils::repacking::compute_inner_n_block(precision)), vnni_factor}); - - VectorDims requested_order(shape.size() - m_config->tile_rank); - std::iota(requested_order.begin(), requested_order.end(), 0); - const auto last_idx = shape.size() - 1; - requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - - optimal_descs[i] = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); + m_param_idces_with_external_repacking.insert(i); } } } -void CPURuntimeConfigurator::adjust_offsets_from_descs(const std::vector& shapes, - const std::vector>& layouts) const { - const auto& cpu_config = ov::as_type_ptr(m_config); - for (const auto& map_elem : cpu_config->m_in_requested_descs) { - const auto input_idx = map_elem.first; - const auto& optimal_desc = map_elem.second; - const auto& original_shape = shapes[input_idx]; - const auto& blocked_shape = optimal_desc->getBlockDims(); - - ov::snippets::VectorDims shape_for_offset(m_config->tensor_rank - original_shape.size(), 1); - shape_for_offset.insert(shape_for_offset.end(), blocked_shape.begin(), blocked_shape.end()); - auto& offsets = m_config->io_data_offsets[input_idx]; - compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_io_data_sizes[input_idx], 0); + +void CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::optimize( + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const std::vector& shapes, + const std::vector>& layouts) { + const auto& cpu_config = ov::as_type_ptr(m_configurator->m_config); + auto& optimal_descs = cpu_config->m_in_requested_descs; + for (const auto& i : m_param_idces_with_external_repacking) { + const auto& shape = shapes[i]; + // TODO: support orbitrary order + const auto& K = *++shape.rbegin(); + const auto& N = *shape.rbegin(); + + const auto& precision = linear_ir->get_parameters()[i]->get_node()->get_output_element_type(0); + const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); + // Firstly, batch dims are set + VectorDims requested_blocked_shape(shape.begin(), shape.end() - m_configurator->m_config->tile_rank); + // Then, the blocked dims are formed + requested_blocked_shape.insert( + requested_blocked_shape.end(), + {snippets::utils::div_up(K, vnni_factor), std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), vnni_factor}); + + VectorDims requested_order(shape.size() - m_configurator->m_config->tile_rank); + std::iota(requested_order.begin(), requested_order.end(), 0); + const auto last_idx = shape.size() - 1; + requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); + + optimal_descs[i] = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); + + ov::snippets::VectorDims shape_for_offset(m_configurator->m_config->tensor_rank - shape.size(), 1); + shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); + auto& offsets = m_configurator->m_config->io_data_offsets[i]; + compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_configurator->m_io_data_sizes[i], 0); // TODO: Support non-planar layout - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[input_idx])); + OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[i])); } } +#endif } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 
c1e3ae499b03a8..616cffc57bc58a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -32,20 +32,8 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { CPURuntimeConfigurator(); protected: - /** - * @brief Update RuntimeConfig based on LinearIR - * @param linear_ir LinearIR - */ void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; - /** - * @brief Update tensor rank based on master shape - * @param master_shape Master shape - */ void update_tensor_rank(const ov::snippets::VectorDims& master_shape) override; - /** - * @brief Initializes tensor rank of config - * @param linear_ir LinearIR - */ void init_tensor_rank(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const override; void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; /** @@ -54,12 +42,6 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; - void update_requested_descs(const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::vector& shapes, - const std::vector>& layouts) const; - void adjust_offsets_from_descs(const std::vector& shapes, - const std::vector>& layouts) const; - static const size_t rank6D; class BrgemmCopyBLoopPortsAdjuster { @@ -73,6 +55,20 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { std::unordered_map> m_affected_uni2exp_map; } loopPortsAdjuster; + + class BrgemmExternalRepackingAdjuster { + public: + BrgemmExternalRepackingAdjuster() = default; + BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator); + + void optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const std::vector& shapes, + const std::vector>& layouts); + + private: + CPURuntimeConfigurator* m_configurator = nullptr; + std::set m_param_idces_with_external_repacking; + } externalRepackingAdjuster; }; } // namespace intel_cpu From e885a38c708b49498afab43af4f02ca574b510f5 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 12 Nov 2024 22:18:09 +0100 Subject: [PATCH 18/42] [WIP] Move adjuster to a separate file --- .../include/snippets/runtime_configurator.hpp | 25 ++++-- .../snippets/src/runtime_configurator.cpp | 6 +- .../snippets/cpu_runtime_configurator.cpp | 66 +-------------- .../snippets/cpu_runtime_configurator.hpp | 21 +---- .../snippets/external_repacking_adjuster.cpp | 83 +++++++++++++++++++ .../snippets/external_repacking_adjuster.hpp | 33 ++++++++ 6 files changed, 141 insertions(+), 93 deletions(-) create mode 100644 src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp create mode 100644 src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 55679bc0745530..97a5aa7f90bfa2 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -44,12 +44,15 @@ class RuntimeConfig { size_t tensor_rank = 0; size_t tile_rank = 0; + std::vector input_shapes = {}; + std::vector input_layouts = {}; std::vector io_data_offsets = {}; ov::snippets::VectorDims master_shape = {}; size_t buffer_scratchpad_size = 0; std::vector buffer_cluster_offsets {}; 
KernelExecutorTablePtr kernel_executor_table = std::make_shared(); + std::vector m_latest_shapes = {}; }; /** @@ -83,6 +86,20 @@ class RuntimeConfigurator { */ void reset_kernel_executor_table() const; + // Getters for private members + std::shared_ptr get_config() const { return m_config; } + size_t get_io_num() const { return m_io_num; } + size_t get_in_num() const { return m_in_num; } + const std::vector& get_io_descs() const { return m_io_descs; } + const std::vector& get_io_data_sizes() const { return m_io_data_sizes; } + const std::map>& get_dynamic_buffer_clusters() const { return m_dynamic_buffer_clusters; } + + static void compute_offsets(const ov::snippets::VectorDims& shape, + ov::snippets::VectorDims& offsets, + size_t offsets_size, + size_t dim_step, + size_t idx_stride); + protected: /** * @brief Update RuntimeConfig based on LinearIR @@ -157,12 +174,6 @@ class RuntimeConfigurator { */ std::vector> extract_layouts() const; - static void compute_offsets(const ov::snippets::VectorDims& shape, - ov::snippets::VectorDims& offsets, - size_t offsets_size, - size_t dim_step, - size_t idx_stride); - class MHAParallelWAOptimizer { public: MHAParallelWAOptimizer() = default; @@ -208,8 +219,6 @@ class RuntimeConfigurator { std::vector m_io_data_sizes = {}; // [cluster_id -> buffer expressions ] std::map> m_dynamic_buffer_clusters = {}; - - std::vector m_latest_shapes = {}; }; } // namespace snippets diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 54581e078aaa1e..2ab4b1c1cff059 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -60,7 +60,7 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRCPtr& linear_ir) init_buffer_info(linear_ir); OPENVINO_ASSERT(m_io_num > 0, "LinearIR must have parameters and results"); - m_latest_shapes.resize(m_io_num); + m_config->m_latest_shapes.resize(m_io_num); m_config->io_data_offsets.resize(m_io_num); m_config->tile_rank = linear_ir->get_config().m_loop_depth; m_optimizer = MHAParallelWAOptimizer(linear_ir, this); @@ -80,7 +80,7 @@ void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { // because `ComputeAllocationSize` depends on subtensors which are updated in the table get_kernel_executor_table()->update_state(linear_ir); update_buffer_scratchpad_size(linear_ir); - m_latest_shapes = std::move(shapes); + m_config->m_latest_shapes = std::move(shapes); } void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { @@ -272,7 +272,7 @@ void RuntimeConfigurator::update_data_offsets(const std::vector& sha // offsets: s1*s3, s3, 0, 1 const auto& shape = shapes[i]; OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!"); - if (shape == m_latest_shapes[i]) + if (shape == m_config->m_latest_shapes[i]) continue; if (utils::is_dynamic_vdims(shape)) return; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 17662960793c95..d78d011b6c4db3 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -75,7 +75,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l } update_data_offsets(shapes, layouts); externalRepackingAdjuster.optimize(linear_ir, shapes, layouts); - m_latest_shapes = 
std::move(shapes); + m_config->m_latest_shapes = std::move(shapes); } void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { @@ -145,69 +145,5 @@ void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { } } #endif - -#ifdef OPENVINO_ARCH_ARM64 -CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { -} - -void CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::optimize() { -} -#else -CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - CPURuntimeConfigurator* configurator) : m_configurator(configurator) { - const auto& params = linear_ir->get_parameters(); - for (size_t i = 0; i < params.size(); ++i) { - const auto& param = params[i]; - const auto consumers = param->get_output_port_connector(0)->get_consumers(); - const bool brgemm_with_extracted_repacking = - std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { - auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); - return port.get_index() == 1 && brgemm && brgemm_utils::with_repacking(brgemm->get_type()); - }); - if (brgemm_with_extracted_repacking) { - m_param_idces_with_external_repacking.insert(i); - } - } -} - -void CPURuntimeConfigurator::BrgemmExternalRepackingAdjuster::optimize( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::vector& shapes, - const std::vector>& layouts) { - const auto& cpu_config = ov::as_type_ptr(m_configurator->m_config); - auto& optimal_descs = cpu_config->m_in_requested_descs; - for (const auto& i : m_param_idces_with_external_repacking) { - const auto& shape = shapes[i]; - // TODO: support orbitrary order - const auto& K = *++shape.rbegin(); - const auto& N = *shape.rbegin(); - - const auto& precision = linear_ir->get_parameters()[i]->get_node()->get_output_element_type(0); - const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); - // Firstly, batch dims are set - VectorDims requested_blocked_shape(shape.begin(), shape.end() - m_configurator->m_config->tile_rank); - // Then, the blocked dims are formed - requested_blocked_shape.insert( - requested_blocked_shape.end(), - {snippets::utils::div_up(K, vnni_factor), std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), vnni_factor}); - - VectorDims requested_order(shape.size() - m_configurator->m_config->tile_rank); - std::iota(requested_order.begin(), requested_order.end(), 0); - const auto last_idx = shape.size() - 1; - requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - - optimal_descs[i] = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); - - ov::snippets::VectorDims shape_for_offset(m_configurator->m_config->tensor_rank - shape.size(), 1); - shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); - auto& offsets = m_configurator->m_config->io_data_offsets[i]; - compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_configurator->m_io_data_sizes[i], 0); - // TODO: Support non-planar layout - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[i])); - } -} -#endif - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp 
b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 616cffc57bc58a..a77e055bd6497a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -4,12 +4,11 @@ #pragma once -#include "snippets/runtime_configurator.hpp" - -#include "snippets/lowered/port_descriptor.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" - +#include "external_repacking_adjuster.hpp" #include "memory_desc/cpu_blocked_memory_desc.h" +#include "snippets/lowered/port_descriptor.hpp" +#include "snippets/runtime_configurator.hpp" namespace ov { namespace intel_cpu { @@ -56,19 +55,7 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { std::vector> m_affected_uni2exp_map; } loopPortsAdjuster; - class BrgemmExternalRepackingAdjuster { - public: - BrgemmExternalRepackingAdjuster() = default; - BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator); - - void optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::vector& shapes, - const std::vector>& layouts); - - private: - CPURuntimeConfigurator* m_configurator = nullptr; - std::set m_param_idces_with_external_repacking; - } externalRepackingAdjuster; + BrgemmExternalRepackingAdjuster externalRepackingAdjuster; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp new file mode 100644 index 00000000000000..a600fd5d17c52d --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp @@ -0,0 +1,83 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "emitters/snippets/cpu_runtime_configurator.hpp" + +#include "memory_desc/cpu_blocked_memory_desc.h" +#include "memory_desc/dnnl_blocked_memory_desc.h" +#include "snippets/utils/utils.hpp" + +#ifndef OPENVINO_ARCH_ARM64 +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" +#endif + +namespace ov { +namespace intel_cpu { + +#ifdef OPENVINO_ARCH_ARM64 +BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { +} + +void BrgemmExternalRepackingAdjuster::optimize() { +} +#else +BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + CPURuntimeConfigurator* configurator) : m_configurator(configurator) { + const auto& params = linear_ir->get_parameters(); + for (size_t i = 0; i < params.size(); ++i) { + const auto& param = params[i]; + const auto consumers = param->get_output_port_connector(0)->get_consumers(); + const bool brgemm_with_extracted_repacking = + std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { + auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); + return port.get_index() == 1 && brgemm && brgemm_utils::with_repacking(brgemm->get_type()); + }); + if (brgemm_with_extracted_repacking) { + m_param_idces_with_external_repacking.insert(i); + } + } +} + +void BrgemmExternalRepackingAdjuster::optimize( + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + const std::vector& shapes, + const std::vector>& layouts) { + const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); + auto& 
optimal_descs = cpu_config->m_in_requested_descs;
+    for (const auto& i : m_param_idces_with_external_repacking) {
+        const auto& shape = shapes[i];
+        // TODO: support arbitrary order
+        const auto& K = *++shape.rbegin();
+        const auto& N = *shape.rbegin();
+
+        const auto& precision = linear_ir->get_parameters()[i]->get_node()->get_output_element_type(0);
+        const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision);
+        // Firstly, batch dims are set
+        VectorDims requested_blocked_shape(shape.begin(), shape.end() - cpu_config->tile_rank);
+        // Then, the blocked dims are formed
+        requested_blocked_shape.insert(
+            requested_blocked_shape.end(),
+            {snippets::utils::div_up(K, vnni_factor), std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), vnni_factor});
+
+        VectorDims requested_order(shape.size() - cpu_config->tile_rank);
+        std::iota(requested_order.begin(), requested_order.end(), 0);
+        const auto last_idx = shape.size() - 1;
+        requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1});
+
+        optimal_descs[i] = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order);
+
+        ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1);
+        shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end());
+        auto& offsets = cpu_config->io_data_offsets[i];
+        snippets::RuntimeConfigurator::compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_configurator->get_io_data_sizes()[i], 0);
+        // TODO: Support non-planar layout
+        OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[i]));
+    }
+}
+#endif
+
+} // namespace intel_cpu
+} // namespace ov
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp
new file mode 100644
index 00000000000000..844024ead37990
--- /dev/null
+++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp
@@ -0,0 +1,33 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/runtime_configurator.hpp"
+
+#include "snippets/lowered/port_descriptor.hpp"
+#include "emitters/snippets/jit_snippets_call_args.hpp"
+
+#include "memory_desc/cpu_blocked_memory_desc.h"
+
+namespace ov {
+namespace intel_cpu {
+
+class CPURuntimeConfigurator;
+class BrgemmExternalRepackingAdjuster {
+public:
+    BrgemmExternalRepackingAdjuster() = default;
+    BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator);
+
+    void optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
+                  const std::vector& shapes,
+                  const std::vector>& layouts);
+
+private:
+    CPURuntimeConfigurator* m_configurator = nullptr;
+    std::set m_param_idces_with_external_repacking;
+};
+
+} // namespace intel_cpu
+} // namespace ov

From aef1ecb0f3093927fdee77cd38b4805572df550b Mon Sep 17 00:00:00 2001
From: Vladislav Golubev
Date: Tue, 12 Nov 2024 22:40:24 +0100
Subject: [PATCH 19/42] [WIP] Use shapes from config in optimizers

---
 .../include/snippets/runtime_configurator.hpp |  9 +++---
 .../snippets/src/runtime_configurator.cpp     | 31 ++++++++++---------
 .../snippets/cpu_runtime_configurator.cpp     | 12 +++----
 .../snippets/external_repacking_adjuster.cpp  |  9 ++----
 .../snippets/external_repacking_adjuster.hpp  |  4 +--
 5 files changed, 30 insertions(+), 35 deletions(-)

diff --git
a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 97a5aa7f90bfa2..ebb7477346a6ef 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -44,8 +44,8 @@ class RuntimeConfig { size_t tensor_rank = 0; size_t tile_rank = 0; - std::vector input_shapes = {}; - std::vector input_layouts = {}; + std::vector shapes = {}; + std::vector layouts = {}; std::vector io_data_offsets = {}; ov::snippets::VectorDims master_shape = {}; @@ -163,8 +163,7 @@ class RuntimeConfigurator { * @param shapes shapes used in offsets computation * @param layouts layouts used in offsets computation */ - void update_data_offsets(const std::vector& shapes, - const std::vector>& layouts) const; + void update_data_offsets() const; /** * @brief Extract shapes from m_io_descs */ @@ -182,7 +181,7 @@ class RuntimeConfigurator { * @brief Checks if the current master shape can be optimized, and if yes, updates all the necessary runtime information * @return status if the optimization is applied */ - bool optimize(std::vector& shapes, std::vector>& layots); + bool optimize(); private: /** diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 2ab4b1c1cff059..1b31556da38ec8 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -68,19 +68,19 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRCPtr& linear_ir) void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { m_config->master_shape = linear_ir->get_master_shape(); + m_config->shapes = extract_shapes(); + m_config->layouts = extract_layouts(); update_loop_info(linear_ir); - auto shapes = extract_shapes(); - auto layouts = extract_layouts(); - m_optimizer.optimize(shapes, layouts); + m_optimizer.optimize(); - update_data_offsets(shapes, layouts); + update_data_offsets(); // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` // because `ComputeAllocationSize` depends on subtensors which are updated in the table get_kernel_executor_table()->update_state(linear_ir); update_buffer_scratchpad_size(linear_ir); - m_config->m_latest_shapes = std::move(shapes); + m_config->m_latest_shapes = std::move(m_config->shapes); } void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { @@ -257,8 +257,9 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRC OPENVINO_ASSERT(!utils::is_dynamic_value(m_config->buffer_scratchpad_size), "Buffer scratchpad size must be defined!"); } -void RuntimeConfigurator::update_data_offsets(const std::vector& shapes, - const std::vector>& layouts) const { +void RuntimeConfigurator::update_data_offsets() const { + const auto& shapes = m_config->shapes; + const auto& layouts = m_config->layouts; OPENVINO_ASSERT(shapes.size() == m_io_num, "Number of custom shapes must be 0 or be equal to m_io_num"); OPENVINO_ASSERT(layouts.size() == m_io_num, "Number of custom layouts must be 0 or be equal to m_io_num"); for (size_t i = 0; i < m_io_num; ++i) { @@ -382,16 +383,16 @@ bool RuntimeConfigurator::MHAParallelWAOptimizer::enabled() const { return !loops_to_split.empty(); } -bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize(std::vector& shapes, - std::vector>& layouts) { +bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize() { 
OPENVINO_ASSERT(configurator != nullptr, "Configurator is nullptr"); if (!enabled()) return false; + const auto& config = configurator->get_config(); size_t new_batch_dim, new_kernel_dim; - if (!SplitDimensionM::split(configurator->m_config->master_shape, concurrency, new_batch_dim, new_kernel_dim)) + if (!SplitDimensionM::split(config->master_shape, concurrency, new_batch_dim, new_kernel_dim)) return false; - auto& master_shape = configurator->m_config->master_shape; + auto& master_shape = config->master_shape; *++master_shape.rbegin() = new_kernel_dim; master_shape.insert(master_shape.cbegin() + master_shape.size() - 2, new_batch_dim); configurator->update_tensor_rank(master_shape); @@ -417,11 +418,11 @@ bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize(std::vectorm_io_num; ++i) { - shapes[i] = unsqueezed_params.count(i) - ? SplitDimensionM::unsqueeze_m_dim(shapes[i], m_dim_idces[i]) - : SplitDimensionM::reshape_m_dim(shapes[i], m_dim_idces[i], new_batch_dim, new_kernel_dim); + config->shapes[i] = unsqueezed_params.count(i) + ? SplitDimensionM::unsqueeze_m_dim(config->shapes[i], m_dim_idces[i]) + : SplitDimensionM::reshape_m_dim(config->shapes[i], m_dim_idces[i], new_batch_dim, new_kernel_dim); } - layouts = optimized_layouts; + config->layouts = optimized_layouts; return true; } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index d78d011b6c4db3..e46089dcc283ad 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -54,13 +54,13 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { m_config->master_shape = linear_ir->get_master_shape(); + m_config->shapes = extract_shapes(); + m_config->layouts = extract_layouts(); if (linear_ir->is_dynamic()) { update_loop_info(linear_ir); } - auto shapes = extract_shapes(); - auto layouts = extract_layouts(); - m_optimizer.optimize(shapes, layouts); + m_optimizer.optimize(); if (linear_ir->is_dynamic()) loopPortsAdjuster.optimize(); @@ -73,9 +73,9 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l if (linear_ir->is_dynamic()) { update_loop_args(linear_ir); } - update_data_offsets(shapes, layouts); - externalRepackingAdjuster.optimize(linear_ir, shapes, layouts); - m_config->m_latest_shapes = std::move(shapes); + update_data_offsets(); + externalRepackingAdjuster.optimize(linear_ir); + m_config->m_latest_shapes = std::move(m_config->shapes); } void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp index a600fd5d17c52d..18b28e972cd35a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp @@ -41,14 +41,11 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( } } -void BrgemmExternalRepackingAdjuster::optimize( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::vector& shapes, - const std::vector>& layouts) { +void BrgemmExternalRepackingAdjuster::optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { const 
auto& cpu_config = ov::as_type_ptr(m_configurator->get_config());
     auto& optimal_descs = cpu_config->m_in_requested_descs;
     for (const auto& i : m_param_idces_with_external_repacking) {
-        const auto& shape = shapes[i];
+        const auto& shape = m_configurator->get_config()->shapes[i];
         // TODO: support arbitrary order
         const auto& K = *++shape.rbegin();
         const auto& N = *shape.rbegin();
@@ -74,7 +71,7 @@ void BrgemmExternalRepackingAdjuster::optimize(
         auto& offsets = cpu_config->io_data_offsets[i];
         snippets::RuntimeConfigurator::compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_configurator->get_io_data_sizes()[i], 0);
         // TODO: Support non-planar layout
-        OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(layouts[i]));
+        OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(m_configurator->get_config()->layouts[i]));
     }
 }
 #endif
diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp
index 844024ead37990..bf325c15b54879 100644
--- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp
+++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp
@@ -20,9 +20,7 @@ class BrgemmExternalRepackingAdjuster {
     BrgemmExternalRepackingAdjuster() = default;
     BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator);

-    void optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir,
-                  const std::vector& shapes,
-                  const std::vector>& layouts);
+    void optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir);

 private:
     CPURuntimeConfigurator* m_configurator = nullptr;

From cdc963644eb54508a0aa36022e005223c735d620 Mon Sep 17 00:00:00 2001
From: Vladislav Golubev
Date: Tue, 12 Nov 2024 22:54:28 +0100
Subject: [PATCH 20/42] [WIP] introduced RuntimeOptimizer base class

---
 .../include/snippets/runtime_optimizer.hpp   | 25 +++++++++++++++++++
 .../snippets/external_repacking_adjuster.cpp | 14 +++++++----
 .../snippets/external_repacking_adjuster.hpp |  8 +++---
 3 files changed, 38 insertions(+), 9 deletions(-)
 create mode 100644 src/common/snippets/include/snippets/runtime_optimizer.hpp

diff --git a/src/common/snippets/include/snippets/runtime_optimizer.hpp b/src/common/snippets/include/snippets/runtime_optimizer.hpp
new file mode 100644
index 00000000000000..fca039cd52c7fe
--- /dev/null
+++ b/src/common/snippets/include/snippets/runtime_optimizer.hpp
@@ -0,0 +1,25 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "snippets/lowered/linear_ir.hpp"
+#include "snippets/lowered/pass/pass.hpp"
+#include "snippets/runtime_configurator.hpp"
+
+namespace ov {
+namespace snippets {
+// TODO: inherit from lowered pass?
+class RuntimeOptimizer { +public: + RuntimeOptimizer() = default; + RuntimeOptimizer(RuntimeConfigurator* configurator) : m_configurator(configurator) {} + + virtual bool optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) = 0; + +protected: + RuntimeConfigurator* m_configurator = nullptr; +}; +} // namespace snippets +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp index 18b28e972cd35a..222823f87a733c 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp @@ -17,15 +17,15 @@ namespace ov { namespace intel_cpu { #ifdef OPENVINO_ARCH_ARM64 -BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { -} +BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + snippets::RuntimeConfigurator* configurator) {} -void BrgemmExternalRepackingAdjuster::optimize() { +bool BrgemmExternalRepackingAdjuster::optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { } #else BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( const ov::snippets::lowered::LinearIRCPtr& linear_ir, - CPURuntimeConfigurator* configurator) : m_configurator(configurator) { + snippets::RuntimeConfigurator* configurator) : snippets::RuntimeOptimizer(configurator) { const auto& params = linear_ir->get_parameters(); for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; @@ -41,7 +41,10 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( } } -void BrgemmExternalRepackingAdjuster::optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { +bool BrgemmExternalRepackingAdjuster::optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { + if (m_param_idces_with_external_repacking.empty()) + return false; + const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); auto& optimal_descs = cpu_config->m_in_requested_descs; for (const auto& i : m_param_idces_with_external_repacking) { @@ -73,6 +76,7 @@ void BrgemmExternalRepackingAdjuster::optimize(const ov::snippets::lowered::Line // TODO: Support non-planar layout OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(m_configurator->get_config()->layouts[i])); } + return true; } #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp index bf325c15b54879..a09245f94b5b54 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp @@ -5,6 +5,7 @@ #pragma once #include "snippets/runtime_configurator.hpp" +#include "snippets/runtime_optimizer.hpp" #include "snippets/lowered/port_descriptor.hpp" #include "emitters/snippets/jit_snippets_call_args.hpp" @@ -15,15 +16,14 @@ namespace ov { namespace intel_cpu { class CPURuntimeConfigurator; -class BrgemmExternalRepackingAdjuster { +class BrgemmExternalRepackingAdjuster : public ov::snippets::RuntimeOptimizer { public: BrgemmExternalRepackingAdjuster() = default; - BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator); + BrgemmExternalRepackingAdjuster(const 
ov::snippets::lowered::LinearIRCPtr& linear_ir, snippets::RuntimeConfigurator* configurator); - void optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir); + bool optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; private: - CPURuntimeConfigurator* m_configurator = nullptr; std::set m_param_idces_with_external_repacking; }; From 710d64e8dbd2dd8e554754c1636d7d02b864ef67 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 12 Nov 2024 23:10:36 +0100 Subject: [PATCH 21/42] [WIP] RuntimeOptimizer inherited from lowered pass --- .../snippets/include/snippets/runtime_optimizer.hpp | 8 +++++--- .../src/emitters/snippets/cpu_runtime_configurator.cpp | 2 +- .../src/emitters/snippets/external_repacking_adjuster.cpp | 7 ++++--- .../src/emitters/snippets/external_repacking_adjuster.hpp | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/common/snippets/include/snippets/runtime_optimizer.hpp b/src/common/snippets/include/snippets/runtime_optimizer.hpp index fca039cd52c7fe..be1301bb9d34b7 100644 --- a/src/common/snippets/include/snippets/runtime_optimizer.hpp +++ b/src/common/snippets/include/snippets/runtime_optimizer.hpp @@ -10,13 +10,15 @@ namespace ov { namespace snippets { -// TODO: inherit from lowered pass? -class RuntimeOptimizer { +class RuntimeOptimizer : public lowered::pass::Pass { public: RuntimeOptimizer() = default; RuntimeOptimizer(RuntimeConfigurator* configurator) : m_configurator(configurator) {} - virtual bool optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) = 0; + virtual bool run(const snippets::lowered::LinearIR& linear_ir) = 0; + bool run(snippets::lowered::LinearIR& linear_ir) override final { // NOLINT + return run(const_cast(linear_ir)); + } protected: RuntimeConfigurator* m_configurator = nullptr; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index e46089dcc283ad..d1f9c5dea33a34 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -74,7 +74,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l update_loop_args(linear_ir); } update_data_offsets(); - externalRepackingAdjuster.optimize(linear_ir); + externalRepackingAdjuster.run(*linear_ir); m_config->m_latest_shapes = std::move(m_config->shapes); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp index 222823f87a733c..f380331ce8cb8f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp @@ -20,7 +20,8 @@ namespace intel_cpu { BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, snippets::RuntimeConfigurator* configurator) {} -bool BrgemmExternalRepackingAdjuster::optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { +bool BrgemmExternalRepackingAdjuster::run(ov::snippets::lowered::LinearIR& linear_ir) { + return false; } #else BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( @@ -41,7 +42,7 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( } } -bool BrgemmExternalRepackingAdjuster::optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { +bool 
BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { if (m_param_idces_with_external_repacking.empty()) return false; @@ -53,7 +54,7 @@ bool BrgemmExternalRepackingAdjuster::optimize(const ov::snippets::lowered::Line const auto& K = *++shape.rbegin(); const auto& N = *shape.rbegin(); - const auto& precision = linear_ir->get_parameters()[i]->get_node()->get_output_element_type(0); + const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); // Firstly, batch dims are set VectorDims requested_blocked_shape(shape.begin(), shape.end() - cpu_config->tile_rank); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp index a09245f94b5b54..35f2875b5fe15e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp @@ -21,7 +21,7 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::RuntimeOptimizer { BrgemmExternalRepackingAdjuster() = default; BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, snippets::RuntimeConfigurator* configurator); - bool optimize(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; + bool run(const snippets::lowered::LinearIR& linear_ir) override; private: std::set m_param_idces_with_external_repacking; From cf667d8775d0cf32ea1ef90aeb2595dfc503c9d0 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 13 Nov 2024 11:09:02 +0100 Subject: [PATCH 22/42] Introduced RuntimeOptimizersPipeline --- .../include/snippets/runtime_configurator.hpp | 5 ++++- .../snippets/include/snippets/runtime_optimizer.hpp | 13 ++++++++++++- .../emitters/snippets/cpu_runtime_configurator.cpp | 4 ++-- .../snippets/external_repacking_adjuster.cpp | 2 +- .../snippets/external_repacking_adjuster.hpp | 2 +- 5 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index ebb7477346a6ef..891ff65c5bcfc1 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -4,9 +4,10 @@ #pragma once +#include "runtime_optimizer.hpp" +#include "snippets/kernel_executor_table.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_info.hpp" -#include "snippets/kernel_executor_table.hpp" #include "snippets/lowered/pass/pass.hpp" namespace ov { @@ -218,6 +219,8 @@ class RuntimeConfigurator { std::vector m_io_data_sizes = {}; // [cluster_id -> buffer expressions ] std::map> m_dynamic_buffer_clusters = {}; + + ov::snippets::lowered::pass::RuntimeOptimizersPipeline m_runtime_optimizers; }; } // namespace snippets diff --git a/src/common/snippets/include/snippets/runtime_optimizer.hpp b/src/common/snippets/include/snippets/runtime_optimizer.hpp index be1301bb9d34b7..f4407f1d9a7ef0 100644 --- a/src/common/snippets/include/snippets/runtime_optimizer.hpp +++ b/src/common/snippets/include/snippets/runtime_optimizer.hpp @@ -10,7 +10,9 @@ namespace ov { namespace snippets { -class RuntimeOptimizer : public lowered::pass::Pass { +namespace lowered { +namespace pass { +class RuntimeOptimizer : public Pass { public: RuntimeOptimizer() = default; RuntimeOptimizer(RuntimeConfigurator* 
configurator) : m_configurator(configurator) {} @@ -23,5 +25,14 @@ class RuntimeOptimizer : public lowered::pass::Pass { protected: RuntimeConfigurator* m_configurator = nullptr; }; + +class RuntimeOptimizersPipeline : public PassPipeline { +public: + void run(const lowered::LinearIR& linear_ir) const { + PassPipeline::run(const_cast(linear_ir)); + } +}; +} // namespace pass +} // namespace lowered } // namespace snippets } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index d1f9c5dea33a34..ebc2a8ecc1f6ab 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -49,7 +49,7 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI if (linear_ir->is_dynamic()) { loopPortsAdjuster = BrgemmCopyBLoopPortsAdjuster(linear_ir); } - externalRepackingAdjuster = BrgemmExternalRepackingAdjuster(linear_ir, this); + m_runtime_optimizers.register_pass(linear_ir, this); } void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { @@ -74,7 +74,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l update_loop_args(linear_ir); } update_data_offsets(); - externalRepackingAdjuster.run(*linear_ir); + m_runtime_optimizers.run(*linear_ir); m_config->m_latest_shapes = std::move(m_config->shapes); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp index f380331ce8cb8f..fdd71e8b962730 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp @@ -26,7 +26,7 @@ bool BrgemmExternalRepackingAdjuster::run(ov::snippets::lowered::LinearIR& linea #else BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( const ov::snippets::lowered::LinearIRCPtr& linear_ir, - snippets::RuntimeConfigurator* configurator) : snippets::RuntimeOptimizer(configurator) { + snippets::RuntimeConfigurator* configurator) : snippets::lowered::pass::RuntimeOptimizer(configurator) { const auto& params = linear_ir->get_parameters(); for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp index 35f2875b5fe15e..451bd6b85fa08a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp @@ -16,7 +16,7 @@ namespace ov { namespace intel_cpu { class CPURuntimeConfigurator; -class BrgemmExternalRepackingAdjuster : public ov::snippets::RuntimeOptimizer { +class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer { public: BrgemmExternalRepackingAdjuster() = default; BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, snippets::RuntimeConfigurator* configurator); From c04366fb1f80855860871e2a43c51168f2e889a4 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 13 Nov 2024 13:53:42 +0100 Subject: [PATCH 23/42] All optimizers are rewritten to RuntimeOptimizers --- .../include/snippets/lowered/pass/pass.hpp | 16 ++ 
.../snippets/mha_parallel_wa_optimizer.hpp | 44 +++++ .../include/snippets/runtime_configurator.hpp | 102 +++++----- .../include/snippets/runtime_optimizer.hpp | 14 +- src/common/snippets/src/lowered/pass/pass.cpp | 14 ++ .../src/mha_parallel_wa_optimizer.cpp | 176 +++++++++++++++++ .../snippets/src/runtime_configurator.cpp | 186 +----------------- .../brgemm_copy_b_loop_ports_adjuster.cpp | 44 +++++ .../brgemm_copy_b_loop_ports_adjuster.hpp | 28 +++ .../snippets/cpu_runtime_configurator.cpp | 56 +----- .../snippets/cpu_runtime_configurator.hpp | 27 +-- .../snippets/external_repacking_adjuster.cpp | 14 +- 12 files changed, 394 insertions(+), 327 deletions(-) create mode 100644 src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp create mode 100644 src/common/snippets/src/mha_parallel_wa_optimizer.cpp create mode 100644 src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp create mode 100644 src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp diff --git a/src/common/snippets/include/snippets/lowered/pass/pass.hpp b/src/common/snippets/include/snippets/lowered/pass/pass.hpp index 446f96d30a27cf..2758ab85070341 100644 --- a/src/common/snippets/include/snippets/lowered/pass/pass.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/pass.hpp @@ -67,6 +67,21 @@ class Pass : public PassBase { virtual bool run(lowered::LinearIR& linear_ir) = 0; }; +/** + * @interface ConstPass + * @brief Base class for LIR passes which are performed on a full LIR body but doesn't change it + * @ingroup snippets + */ +class ConstPass : public PassBase { +public: + /** + * @brief Apply the pass to the Linear IR + * @param linear_ir the target Linear IR + * @return status of the pass + */ + virtual bool run(const lowered::LinearIR& linear_ir) = 0; +}; + /** * @interface RangedPass * @brief Base class for LIR passes which are performed on a range of a LIR body @@ -114,6 +129,7 @@ class PassPipeline { void register_positioned_passes(const std::vector& pos_passes); void run(lowered::LinearIR& linear_ir) const; + void run(const lowered::LinearIR& linear_ir) const; void run(lowered::LinearIR& linear_ir, lowered::LinearIR::constExprIt begin, lowered::LinearIR::constExprIt end) const; /** diff --git a/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp b/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp new file mode 100644 index 00000000000000..97d0cfce709095 --- /dev/null +++ b/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp @@ -0,0 +1,44 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "runtime_optimizer.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_info.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { + +class MHAParallelWAOptimizer : public lowered::pass::RuntimeOptimizer { +public: + MHAParallelWAOptimizer() = default; + MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, RuntimeConfigurator* configurator); + + bool run(const lowered::LinearIR& linear_ir) override; + +private: + static std::unordered_set find_applicable_brgemms(const lowered::LinearIRCPtr& linear_ir); + static std::unordered_set find_unsqueezed_params( + const lowered::LinearIRCPtr& linear_ir, + const std::unordered_set& brgemms); + static std::vector find_loops_to_split( + const lowered::LinearIRCPtr& linear_ir, + const std::unordered_set& unsqueezed_params); + + std::vector 
loops_to_split{}; + std::unordered_set unsqueezed_params{}; + std::vector> optimized_layouts{}; + std::vector m_dim_idces{}; + size_t concurrency = 0; + + static const size_t m_dim_idx; +}; + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // namespace ov \ No newline at end of file diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 891ff65c5bcfc1..0550be144d41ca 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -4,11 +4,11 @@ #pragma once -#include "runtime_optimizer.hpp" #include "snippets/kernel_executor_table.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_info.hpp" #include "snippets/lowered/pass/pass.hpp" +#include "snippets/runtime_optimizer.hpp" namespace ov { namespace snippets { @@ -65,6 +65,22 @@ class RuntimeConfigurator { RuntimeConfigurator(std::shared_ptr c); virtual ~RuntimeConfigurator() = default; + // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface, + // so the standard OPENVINO_RTTI(...) macros could be used in derived classes. + _OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() { + static ::ov::DiscreteTypeInfo type_info_static {"RuntimeConfigurator"}; + type_info_static.hash(); + return type_info_static; + } + + virtual const DiscreteTypeInfo& get_type_info() const { + return get_type_info_static(); + } + + const char* get_type_name() const { + return get_type_info().name; + } + /** * @brief Update RuntimeConfig based on new state of LinearIR and return its * @param linear_ir LinearIR @@ -101,18 +117,33 @@ class RuntimeConfigurator { size_t dim_step, size_t idx_stride); -protected: + struct UnifiedLoopInfoRtParams { + size_t work_amount = 0; + std::vector ptr_increments; + std::vector finalization_offsets; + }; + static UnifiedLoopInfoRtParams get_loop_runtime_params(const lowered::UnifiedLoopInfoPtr& unified_loop_info); + using LoopInfoRuntimeParamsMap = std::unordered_map; /** - * @brief Update RuntimeConfig based on LinearIR + * @brief Update Loop informations in LinearIR: Unified and ExpandedLoopInfo * @param linear_ir LinearIR - * @todo Ticket 148891: Rewrite on PassPipeline */ - virtual void update(const lowered::LinearIRCPtr& linear_ir); + static void update_loop_info(const lowered::LinearIRCPtr& linear_ir); + static void update_expanded_loop_info(const lowered::ExpandedLoopInfoPtr& expanded_loop_info, + LoopInfoRuntimeParamsMap& initializated_info_map); /** * @brief Update tensor rank based on master shape * @param master_shape Master shape */ virtual void update_tensor_rank(const ov::snippets::VectorDims& master_shape); + +protected: + /** + * @brief Update RuntimeConfig based on LinearIR + * @param linear_ir LinearIR + * @todo Ticket 148891: Rewrite on PassPipeline + */ + virtual void update(const lowered::LinearIRCPtr& linear_ir); /** * @brief Allocate and intialize fields in RuntimeConfig and RuntimeConfigurator * @param linear_ir LinearIR @@ -138,21 +169,6 @@ class RuntimeConfigurator { * @param linear_ir LinearIR */ virtual void init_tensor_rank(const lowered::LinearIRCPtr& linear_ir) const; - - struct UnifiedLoopInfoRtParams { - size_t work_amount = 0; - std::vector ptr_increments; - std::vector finalization_offsets; - }; - static UnifiedLoopInfoRtParams get_loop_runtime_params(const lowered::UnifiedLoopInfoPtr& unified_loop_info); - 
using LoopInfoRuntimeParamsMap = std::unordered_map; - /** - * @brief Update Loop informations in LinearIR: Unified and ExpandedLoopInfo - * @param linear_ir LinearIR - */ - static void update_loop_info(const lowered::LinearIRCPtr& linear_ir); - static void update_expanded_loop_info(const lowered::ExpandedLoopInfoPtr& expanded_loop_info, - LoopInfoRuntimeParamsMap& initializated_info_map); /** * @brief Update Buffer scratchpad size and offsets if needed * Note: `update_loop_info` must be called before @@ -174,43 +190,6 @@ class RuntimeConfigurator { */ std::vector> extract_layouts() const; - class MHAParallelWAOptimizer { - public: - MHAParallelWAOptimizer() = default; - MHAParallelWAOptimizer(const ov::snippets::lowered::LinearIRCPtr& linear_ir, RuntimeConfigurator* configurator); - /** - * @brief Checks if the current master shape can be optimized, and if yes, updates all the necessary runtime information - * @return status if the optimization is applied - */ - bool optimize(); - - private: - /** - * @brief Checks if optimizer is enabled - * @todo Ticket 148891: when RuntimeConfigurator::update will be rewritten on PassPipeline, this method should be removed - * We will not just register MHAParallelWAOptimizer in case if it is not needed - */ - bool enabled() const; - - static std::unordered_set find_applicable_brgemms(const ov::snippets::lowered::LinearIRCPtr& linear_ir); - static std::unordered_set find_unsqueezed_params( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::unordered_set& brgemms); - static std::vector find_loops_to_split( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::unordered_set& unsqueezed_params); - - RuntimeConfigurator* configurator = nullptr; - - std::vector loops_to_split{}; - std::unordered_set unsqueezed_params{}; - std::vector> optimized_layouts{}; - std::vector m_dim_idces{}; - size_t concurrency = 0; - - static const size_t m_dim_idx; - } m_optimizer; - std::shared_ptr m_config = nullptr; size_t m_io_num = 0; @@ -220,7 +199,14 @@ class RuntimeConfigurator { // [cluster_id -> buffer expressions ] std::map> m_dynamic_buffer_clusters = {}; - ov::snippets::lowered::pass::RuntimeOptimizersPipeline m_runtime_optimizers; + // WA: until 148891 is not implemented, 2 pass pipelines for runtime optimizers are necessary since different + // optimizers must be called at different pipeline stages. 
+ // - Intermediate optimizers must be called right after `update_loop_info` + // - Final optimizers must be called after all other RuntimeConfigurator's update methods + // When all updates will be rewritten on PassPipeline, PositionedPasses can be used to precisely define the place of + // the additional optimizers + lowered::pass::PassPipeline m_intermediate_runtime_optimizers; + lowered::pass::PassPipeline m_final_runtime_optimizers; }; } // namespace snippets diff --git a/src/common/snippets/include/snippets/runtime_optimizer.hpp b/src/common/snippets/include/snippets/runtime_optimizer.hpp index f4407f1d9a7ef0..1b52d89d7e8a07 100644 --- a/src/common/snippets/include/snippets/runtime_optimizer.hpp +++ b/src/common/snippets/include/snippets/runtime_optimizer.hpp @@ -12,26 +12,14 @@ namespace ov { namespace snippets { namespace lowered { namespace pass { -class RuntimeOptimizer : public Pass { +class RuntimeOptimizer : public ConstPass { public: RuntimeOptimizer() = default; RuntimeOptimizer(RuntimeConfigurator* configurator) : m_configurator(configurator) {} - - virtual bool run(const snippets::lowered::LinearIR& linear_ir) = 0; - bool run(snippets::lowered::LinearIR& linear_ir) override final { // NOLINT - return run(const_cast(linear_ir)); - } - protected: RuntimeConfigurator* m_configurator = nullptr; }; -class RuntimeOptimizersPipeline : public PassPipeline { -public: - void run(const lowered::LinearIR& linear_ir) const { - PassPipeline::run(const_cast(linear_ir)); - } -}; } // namespace pass } // namespace lowered } // namespace snippets diff --git a/src/common/snippets/src/lowered/pass/pass.cpp b/src/common/snippets/src/lowered/pass/pass.cpp index f5b902a1a17b8c..5dcd75572df8f5 100644 --- a/src/common/snippets/src/lowered/pass/pass.cpp +++ b/src/common/snippets/src/lowered/pass/pass.cpp @@ -27,6 +27,20 @@ void PassPipeline::register_pass(const std::shared_ptr& pass) { m_passes.push_back(pass); } +void PassPipeline::run(const lowered::LinearIR& linear_ir) const { + for (const auto& pass : m_passes) { + OPENVINO_ASSERT(pass != nullptr, "PassPipeline has empty pass!"); + // SNIPPETS_DEBUG_LIR_PASS_DUMP(linear_ir, pass); + + if (m_pass_config->is_disabled(pass->get_type_info())) { + continue; + } + const auto const_pass = std::dynamic_pointer_cast(pass); + OPENVINO_ASSERT(const_pass != nullptr, "Unexpected pass (", pass->get_type_info(), ") is registered in PassPipeline"); + const_pass->run(linear_ir); + } +} + void PassPipeline::run(LinearIR& linear_ir) const { run(linear_ir, linear_ir.cbegin(), linear_ir.cend()); } diff --git a/src/common/snippets/src/mha_parallel_wa_optimizer.cpp b/src/common/snippets/src/mha_parallel_wa_optimizer.cpp new file mode 100644 index 00000000000000..bb70af011c6c76 --- /dev/null +++ b/src/common/snippets/src/mha_parallel_wa_optimizer.cpp @@ -0,0 +1,176 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/mha_parallel_wa_optimizer.hpp" + +#include "snippets/pass/split_dimension_m.hpp" +#include "snippets/utils/utils.hpp" +#include "snippets/utils/loop_utils.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/lowered/loop_info.hpp" + +namespace ov { +namespace snippets { +namespace lowered { +namespace pass { +using namespace ov::snippets::pass; + +const size_t MHAParallelWAOptimizer::m_dim_idx = 1; + +MHAParallelWAOptimizer::MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, RuntimeConfigurator* configurator) + : lowered::pass::RuntimeOptimizer(configurator) { + if 
(linear_ir->get_config().m_enable_domain_optimization || !linear_ir->is_dynamic()) + return; + + const auto brgemms = find_applicable_brgemms(linear_ir); + if (brgemms.empty()) + return; + + concurrency = linear_ir->get_config().m_min_parallel_work_amount; + unsqueezed_params = find_unsqueezed_params(linear_ir, brgemms); + OPENVINO_ASSERT(!unsqueezed_params.empty(), "unsqueezed_params mustn't be empty after initialization"); + loops_to_split = find_loops_to_split(linear_ir, unsqueezed_params); + + m_dim_idces.resize(configurator->get_io_num()); + optimized_layouts.resize(configurator->get_io_num()); + for (size_t i = 0; i < configurator->get_io_num(); ++i) { + const auto& layout = configurator->get_io_descs()[i]->get_layout(); + const auto dim_idx = i < configurator->get_in_num() ? utils::get_input_dim_idx(layout, m_dim_idx) + : utils::get_output_dim_idx(layout, m_dim_idx); + m_dim_idces[i] = dim_idx; + const auto m_idx = i < configurator->get_in_num() ? dim_idx : layout.size() - 2; + optimized_layouts[i] = SplitDimensionM::get_updated_order(layout, m_idx); + } +} + +bool MHAParallelWAOptimizer::run(const lowered::LinearIR& linear_ir) { + if (loops_to_split.empty()) + return false; + + const auto& config = m_configurator->get_config(); + size_t new_batch_dim, new_kernel_dim; + if (!SplitDimensionM::split(config->master_shape, concurrency, new_batch_dim, new_kernel_dim)) + return false; + auto& master_shape = config->master_shape; + *++master_shape.rbegin() = new_kernel_dim; + master_shape.insert(master_shape.cbegin() + master_shape.size() - 2, new_batch_dim); + m_configurator->update_tensor_rank(master_shape); + + RuntimeConfigurator::LoopInfoRuntimeParamsMap initialized_info; + auto updater = [&](const lowered::LoopInfoPtr& loop_info) { + if (const auto unified_loop_info = ov::as_type_ptr(loop_info)) { + if (initialized_info.count(unified_loop_info) == 0) { + if (!ov::is_type(unified_loop_info)) + unified_loop_info->set_work_amount(new_kernel_dim); + snippets::utils::update_data_pointer_shifts(unified_loop_info); + initialized_info[unified_loop_info] = RuntimeConfigurator::get_loop_runtime_params(unified_loop_info); + } + } else if (const auto expanded_loop_info = ov::as_type_ptr(loop_info)) { + m_configurator->update_expanded_loop_info(expanded_loop_info, initialized_info); + } else { + OPENVINO_THROW("Failed to update loop info: unknown type!"); + } + }; + lowered::LoopInfoSet updated_loops; + for (const auto& loop : loops_to_split) { + loop->apply(updater, updated_loops); + } + + for (size_t i = 0; i < m_configurator->get_io_num(); ++i) { + config->shapes[i] = unsqueezed_params.count(i) + ? 
SplitDimensionM::unsqueeze_m_dim(config->shapes[i], m_dim_idces[i]) + : SplitDimensionM::reshape_m_dim(config->shapes[i], m_dim_idces[i], new_batch_dim, new_kernel_dim); + } + config->layouts = optimized_layouts; + return true; +} + +std::unordered_set MHAParallelWAOptimizer::find_applicable_brgemms(const lowered::LinearIRCPtr& linear_ir) { + auto is_brgemm = [](const lowered::ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }; + auto brgemm_it = std::find_if(linear_ir->begin(), linear_ir->end(), is_brgemm); + std::unordered_set brgemms; + while (brgemm_it != linear_ir->end()) { + brgemms.insert(*brgemm_it); + brgemm_it = std::find_if(std::next(brgemm_it), linear_ir->end(), is_brgemm); + } + const auto& loop_manager = linear_ir->get_loop_manager(); + auto applicable_brgemm = [&loop_manager](const lowered::ExpressionPtr& expr) { + const auto& loop_idces = expr->get_loop_ids(); + if (loop_idces.empty()) + return false; + const auto& outermost_loop = loop_manager->get_loop_info(loop_idces[0]); + if (!snippets::utils::is_dynamic_value(outermost_loop->get_work_amount())) + return false; + bool loop_by_m = true; + outermost_loop->iterate_through_ports([&loop_by_m](const lowered::LoopPort& port) { + if (port.is_incremented && port.dim_idx != m_dim_idx) + loop_by_m = false; + }); + return loop_by_m; + }; + return std::all_of(brgemms.begin(), brgemms.end(), applicable_brgemm) ? brgemms : std::unordered_set{}; +} + +std::unordered_set MHAParallelWAOptimizer::find_unsqueezed_params( + const lowered::LinearIRCPtr& linear_ir, + const std::unordered_set& brgemms) { + const auto& params = linear_ir->get_parameters(); + std::unordered_set unsqueezed_params; + auto add_param = [¶ms, &unsqueezed_params](const lowered::ExpressionPtr& expr) { + if (ov::is_type(expr->get_node())) { + auto found_param = std::find(params.begin(), params.end(), expr); + OPENVINO_ASSERT(found_param != params.end(), "find_param didn't found parameter for expr"); + unsqueezed_params.insert(std::distance(params.begin(), found_param)); + } + }; + + std::unordered_set visited; + for (const auto& brgemm : brgemms) { + const auto& brgemm_b_input = brgemm->get_input_port_connector(1)->get_source().get_expr(); + utils::visit_path(brgemm_b_input, visited, add_param, true); + } + return unsqueezed_params; +} + +std::vector MHAParallelWAOptimizer::find_loops_to_split( + const lowered::LinearIRCPtr& linear_ir, + const std::unordered_set& unsqueezed_params) { + const auto loop_manager = linear_ir->get_loop_manager(); + std::set loop_idces_to_split; + std::vector prev_loop_idces; + + auto add_loop_idx_to_split = [&](const lowered::ExpressionPtr& expr) { + const auto& loop_idces = expr->get_loop_ids(); + if (loop_idces != prev_loop_idces) { + prev_loop_idces = loop_idces; + for (const auto& loop_id : loop_idces) { + const auto expanded_loop_info = loop_manager->get_loop_info(loop_id); + if (expanded_loop_info->get_dim_idx() == m_dim_idx) { + loop_idces_to_split.insert(loop_id); + } + } + } + }; + + size_t i = 0; + std::unordered_set visited; + for (const auto& param : linear_ir->get_parameters()) { + if (unsqueezed_params.count(i++)) + continue; + utils::visit_path(param, visited, add_loop_idx_to_split, false); + } + + const auto& loops_map = linear_ir->get_loop_manager()->get_map(); + std::vector loops_to_split; + for (const auto& id : loop_idces_to_split) + loops_to_split.push_back(ov::as_type_ptr(loops_map.at(id))); + return loops_to_split; +} + +} // namespace pass +} // namespace lowered +} // namespace snippets +} // 
namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 1b31556da38ec8..26bd64f227a12a 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -7,10 +7,11 @@ #include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" -#include "snippets/pass/split_dimension_m.hpp" +#include "snippets/mha_parallel_wa_optimizer.hpp" +#include "snippets/runtime_optimizer.hpp" #include "snippets/snippets_isa.hpp" -#include "snippets/utils/utils.hpp" #include "snippets/utils/loop_utils.hpp" +#include "snippets/utils/utils.hpp" namespace ov { namespace snippets { @@ -63,7 +64,9 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRCPtr& linear_ir) m_config->m_latest_shapes.resize(m_io_num); m_config->io_data_offsets.resize(m_io_num); m_config->tile_rank = linear_ir->get_config().m_loop_depth; - m_optimizer = MHAParallelWAOptimizer(linear_ir, this); + + if (linear_ir->is_dynamic()) + m_intermediate_runtime_optimizers.register_pass(linear_ir, this); } void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { @@ -72,7 +75,7 @@ void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { m_config->layouts = extract_layouts(); update_loop_info(linear_ir); - m_optimizer.optimize(); + m_intermediate_runtime_optimizers.run(*linear_ir); update_data_offsets(); @@ -80,6 +83,7 @@ void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { // because `ComputeAllocationSize` depends on subtensors which are updated in the table get_kernel_executor_table()->update_state(linear_ir); update_buffer_scratchpad_size(linear_ir); + m_final_runtime_optimizers.run(*linear_ir); m_config->m_latest_shapes = std::move(m_config->shapes); } @@ -345,179 +349,5 @@ RuntimeConfigurator::UnifiedLoopInfoRtParams RuntimeConfigurator::get_loop_runti }); return rt_params; } - -const size_t RuntimeConfigurator::MHAParallelWAOptimizer::m_dim_idx = 1; - -RuntimeConfigurator::MHAParallelWAOptimizer::MHAParallelWAOptimizer( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - RuntimeConfigurator* configurator) - : configurator(configurator) { - OPENVINO_ASSERT(configurator != nullptr, "Configurator is nullptr"); - - if (linear_ir->get_config().m_enable_domain_optimization || !linear_ir->is_dynamic()) - return; - - const auto brgemms = find_applicable_brgemms(linear_ir); - // Parallel WA optimization is Brgemm related - if (brgemms.empty()) - return; - - concurrency = linear_ir->get_config().m_min_parallel_work_amount; - // At the moment this optimization is Brgemm related so there must be `unsqueezed_params` - unsqueezed_params = find_unsqueezed_params(linear_ir, brgemms); - OPENVINO_ASSERT(!unsqueezed_params.empty(), "unsqueezed_params mustn't be empty after initialization"); - loops_to_split = find_loops_to_split(linear_ir, unsqueezed_params); - - m_dim_idces.resize(configurator->m_io_num); - optimized_layouts.resize(configurator->m_io_num); - for (size_t i = 0; i < configurator->m_io_num; ++i) { - const auto& layout = configurator->m_io_descs[i]->get_layout(); - const auto dim_idx = i < configurator->m_in_num ? 
utils::get_input_dim_idx(layout, m_dim_idx) - : utils::get_output_dim_idx(layout, m_dim_idx); - m_dim_idces[i] = dim_idx; - optimized_layouts[i] = SplitDimensionM::get_updated_order(layout, i < configurator->m_in_num ? dim_idx : layout.size() - 2); - } -} - -bool RuntimeConfigurator::MHAParallelWAOptimizer::enabled() const { - return !loops_to_split.empty(); -} - -bool RuntimeConfigurator::MHAParallelWAOptimizer::optimize() { - OPENVINO_ASSERT(configurator != nullptr, "Configurator is nullptr"); - if (!enabled()) - return false; - - const auto& config = configurator->get_config(); - size_t new_batch_dim, new_kernel_dim; - if (!SplitDimensionM::split(config->master_shape, concurrency, new_batch_dim, new_kernel_dim)) - return false; - auto& master_shape = config->master_shape; - *++master_shape.rbegin() = new_kernel_dim; - master_shape.insert(master_shape.cbegin() + master_shape.size() - 2, new_batch_dim); - configurator->update_tensor_rank(master_shape); - - LoopInfoRuntimeParamsMap initialized_info; - auto updater = [&](const lowered::LoopInfoPtr& loop_info) { - if (const auto unified_loop_info = ov::as_type_ptr(loop_info)) { - if (initialized_info.count(unified_loop_info) == 0) { - if (!ov::is_type(unified_loop_info)) - unified_loop_info->set_work_amount(new_kernel_dim); - utils::update_data_pointer_shifts(unified_loop_info); - initialized_info[unified_loop_info] = get_loop_runtime_params(unified_loop_info); - } - } else if (const auto expanded_loop_info = ov::as_type_ptr(loop_info)) { - configurator->update_expanded_loop_info(expanded_loop_info, initialized_info); - } else { - OPENVINO_THROW("Failed to update loop info: unknown type!"); - } - }; - lowered::LoopInfoSet updated_loops; - for (const auto& loop : loops_to_split) { - loop->apply(updater, updated_loops); - } - - for (size_t i = 0; i < configurator->m_io_num; ++i) { - config->shapes[i] = unsqueezed_params.count(i) - ? 
SplitDimensionM::unsqueeze_m_dim(config->shapes[i], m_dim_idces[i]) - : SplitDimensionM::reshape_m_dim(config->shapes[i], m_dim_idces[i], new_batch_dim, new_kernel_dim); - } - config->layouts = optimized_layouts; - return true; -} - -std::unordered_set RuntimeConfigurator::MHAParallelWAOptimizer::find_applicable_brgemms( - const lowered::LinearIRCPtr& linear_ir) { - auto is_brgemm = [](const ExpressionPtr& expr) { - return ov::is_type(expr->get_node()); - }; - auto brgemm_it = std::find_if(linear_ir->begin(), linear_ir->end(), is_brgemm); - std::unordered_set brgemms; - while (brgemm_it != linear_ir->end()) { - brgemms.insert(*brgemm_it); - brgemm_it = std::find_if(std::next(brgemm_it), linear_ir->end(), is_brgemm); - } - const auto& loop_manager = linear_ir->get_loop_manager(); - // Brgemm is applicable if it has dynamic loop by M - // The loop by M is necessary since only in this case we can regulate BrgemmExecutor parameters (via loop's work amount) - // Only dynamic loops are applicable since in static case LoopEnd expressions are not updated during code generation and compiled as is - // Ticket: 148805 - auto applicable_brgemm = [&loop_manager](const ExpressionPtr& expr) { - const auto& loop_idces = expr->get_loop_ids(); - if (loop_idces.empty()) - return false; - const auto& outermost_loop = loop_manager->get_loop_info(loop_idces[0]); - if (!utils::is_dynamic_value(outermost_loop->get_work_amount())) - return false; - bool loop_by_m = true; - outermost_loop->iterate_through_ports([&loop_by_m](const LoopPort& port) { - if (port.is_incremented && port.dim_idx != m_dim_idx) - loop_by_m = false; - }); - return loop_by_m; - }; - // Note: if at least one brgemm is inapplicable, the parallel work amount optimization can't be applied - return std::all_of(brgemms.begin(), brgemms.end(), applicable_brgemm) ? 
brgemms : std::unordered_set{}; -} - -std::unordered_set RuntimeConfigurator::MHAParallelWAOptimizer::find_unsqueezed_params( - const lowered::LinearIRCPtr& linear_ir, - const std::unordered_set& brgemms) { - const auto& params = linear_ir->get_parameters(); - std::unordered_set unsqueezed_params; - auto add_param = [¶ms, &unsqueezed_params](const ExpressionPtr& expr) { - if (ov::is_type(expr->get_node())) { - auto found_param = std::find(params.begin(), params.end(), expr); - OPENVINO_ASSERT(found_param != params.end(), "find_param didn't found parameter for expr"); - unsqueezed_params.insert(std::distance(params.begin(), found_param)); - } - }; - - std::unordered_set visited; - for (const auto& brgemm : brgemms) { - const auto& brgemm_b_input = brgemm->get_input_port_connector(1)->get_source().get_expr(); - utils::visit_path(brgemm_b_input, visited, add_param, true); - } - return unsqueezed_params; -} - -std::vector RuntimeConfigurator::MHAParallelWAOptimizer::find_loops_to_split( - const lowered::LinearIRCPtr& linear_ir, - const std::unordered_set& unsqueezed_params) { - const auto loop_manager = linear_ir->get_loop_manager(); - std::set loop_idces_to_split; - std::vector prev_loop_idces; - - auto add_loop_idx_to_split = [&](const ExpressionPtr& expr) { - const auto& loop_idces = expr->get_loop_ids(); - if (loop_idces != prev_loop_idces) { - prev_loop_idces = loop_idces; - for (const auto& loop_id : loop_idces) { - const auto expanded_loop_info = loop_manager->get_loop_info(loop_id); - if (expanded_loop_info->get_dim_idx() == m_dim_idx) { - loop_idces_to_split.insert(loop_id); - } - } - } - }; - - size_t i = 0; - std::unordered_set visited; - // The idea is to traverse LIR down from the M dimension related parameters - // and find all the outermost loops: these loops will be split in runtime - for (const auto& param : linear_ir->get_parameters()) { - // Ops after non related params mustn't be traversed - if (unsqueezed_params.count(i++)) - continue; - utils::visit_path(param, visited, add_loop_idx_to_split, false); - } - - const auto& loops_map = linear_ir->get_loop_manager()->get_map(); - std::vector loops_to_split; - for (const auto& id : loop_idces_to_split) - loops_to_split.push_back(ov::as_type_ptr(loops_map.at(id))); - return loops_to_split; -} - } // namespace snippets } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp new file mode 100644 index 00000000000000..965847c9053e1b --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp @@ -0,0 +1,44 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_copy_b_loop_ports_adjuster.hpp" +#include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" + +namespace ov { +namespace intel_cpu { + +BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator) + : ov::snippets::lowered::pass::RuntimeOptimizer(configurator) { + const auto& pass = std::make_shared(); + pass->run(*linear_ir); + const auto& affected_uni_loops = pass->get_affected_loops(); + const auto& loop_map = linear_ir->get_loop_manager()->get_map(); + for (const auto& p : loop_map) { + if (const auto& exp_loop = ov::as_type_ptr(p.second)) { + const auto& uni_loop = exp_loop->get_unified_loop_info(); + if 
(affected_uni_loops.count(uni_loop)) + m_affected_uni2exp_map[uni_loop].push_back(exp_loop); + } + } +} + +bool BrgemmCopyBLoopPortsAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { + if (m_affected_uni2exp_map.empty()) + return false; + + for (const auto& p : m_affected_uni2exp_map) { + const auto& uni_loop = p.first; + const auto& exp_loops = p.second; + snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap initialized_info; + if (intel_cpu::pass::AdjustBrgemmCopyBLoopPorts::update_loop_info(uni_loop)) { + initialized_info[uni_loop] = snippets::RuntimeConfigurator::get_loop_runtime_params(uni_loop); + for (const auto& exp_loop : exp_loops) + snippets::RuntimeConfigurator::update_expanded_loop_info(exp_loop, initialized_info); + } + } + return true; +} + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp new file mode 100644 index 00000000000000..c0264b43278a72 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp @@ -0,0 +1,28 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "cpu_runtime_configurator.hpp" +#include "snippets/lowered/linear_ir.hpp" +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/runtime_optimizer.hpp" + +namespace ov { +namespace intel_cpu { + +class BrgemmCopyBLoopPortsAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer { +public: + BrgemmCopyBLoopPortsAdjuster() = default; + BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator); + + bool run(const snippets::lowered::LinearIR& linear_ir) override; + +private: + std::unordered_map> m_affected_uni2exp_map; +}; + +} // namespace intel_cpu +} // namespace ov \ No newline at end of file diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index ebc2a8ecc1f6ab..16a040d29a3ff7 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -11,9 +11,10 @@ #include "snippets/utils/utils.hpp" #ifndef OPENVINO_ARCH_ARM64 +#include "brgemm_copy_b_loop_ports_adjuster.hpp" +#include "external_repacking_adjuster.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" -#include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" #endif namespace ov { namespace intel_cpu { @@ -46,10 +47,11 @@ CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigur void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { RuntimeConfigurator::initialization(linear_ir); - if (linear_ir->is_dynamic()) { - loopPortsAdjuster = BrgemmCopyBLoopPortsAdjuster(linear_ir); - } - m_runtime_optimizers.register_pass(linear_ir, this); +#ifndef OPENVINO_ARCH_ARM64 + if (linear_ir->is_dynamic()) + m_intermediate_runtime_optimizers.register_pass(linear_ir, this); + m_final_runtime_optimizers.register_pass(linear_ir, this); +#endif } void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { @@ -60,10 +62,7 @@ void CPURuntimeConfigurator::update(const 
ov::snippets::lowered::LinearIRCPtr& l update_loop_info(linear_ir); } - m_optimizer.optimize(); - - if (linear_ir->is_dynamic()) - loopPortsAdjuster.optimize(); + m_intermediate_runtime_optimizers.run(*linear_ir); // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` // because `ComputeAllocationSize` depends on subtensors which are updated in the table @@ -74,7 +73,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l update_loop_args(linear_ir); } update_data_offsets(); - m_runtime_optimizers.run(*linear_ir); + m_final_runtime_optimizers.run(*linear_ir); m_config->m_latest_shapes = std::move(m_config->shapes); } @@ -108,42 +107,5 @@ void CPURuntimeConfigurator::update_loop_args(const ov::snippets::lowered::Linea } } } -#ifdef OPENVINO_ARCH_ARM64 -CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - CPURuntimeConfigurator* configurator) {} - -void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - const std::vector& shapes, - const std::vector>& layouts) {} -#else -CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { - const auto& pass = std::make_shared(); - pass->run(*linear_ir); - const auto& affected_uni_loops = pass->get_affected_loops(); - const auto& loop_map = linear_ir->get_loop_manager()->get_map(); - for (const auto& p : loop_map) { - if (const auto& exp_loop = ov::as_type_ptr(p.second)) { - const auto& uni_loop = exp_loop->get_unified_loop_info(); - if (affected_uni_loops.count(uni_loop)) - m_affected_uni2exp_map[uni_loop].push_back(exp_loop); - } - } -} - -void CPURuntimeConfigurator::BrgemmCopyBLoopPortsAdjuster::optimize() { - for (const auto& p : m_affected_uni2exp_map) { - const auto& uni_loop = p.first; - const auto& exp_loops = p.second; - snippets::RuntimeConfigurator::LoopInfoRuntimeParamsMap initialized_info; - if (intel_cpu::pass::AdjustBrgemmCopyBLoopPorts::update_loop_info(uni_loop)) { - initialized_info[uni_loop] = get_loop_runtime_params(uni_loop); - for (const auto& exp_loop : exp_loops) - update_expanded_loop_info(exp_loop, initialized_info); - } - } -} -#endif } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index a77e055bd6497a..a59489b8cc6fc1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -5,7 +5,6 @@ #pragma once #include "emitters/snippets/jit_snippets_call_args.hpp" -#include "external_repacking_adjuster.hpp" #include "memory_desc/cpu_blocked_memory_desc.h" #include "snippets/lowered/port_descriptor.hpp" #include "snippets/runtime_configurator.hpp" @@ -28,34 +27,24 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: + OPENVINO_RTTI("CPURuntimeConfigurator", "0", ov::snippets::RuntimeConfigurator) CPURuntimeConfigurator(); -protected: - void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; - void update_tensor_rank(const ov::snippets::VectorDims& master_shape) override; - void init_tensor_rank(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const override; - void 
initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig * @param linear_ir LinearIR */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; +protected: + void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; + void update_tensor_rank(const ov::snippets::VectorDims& master_shape) override; + void init_tensor_rank(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const override; + void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; static const size_t rank6D; - class BrgemmCopyBLoopPortsAdjuster { - public: - BrgemmCopyBLoopPortsAdjuster() = default; - BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir); - - void optimize(); - - private: - std::unordered_map> m_affected_uni2exp_map; - } loopPortsAdjuster; - - BrgemmExternalRepackingAdjuster externalRepackingAdjuster; +private: + snippets::lowered::pass::PassPipeline m_cpu_runtime_optimizers; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp index fdd71e8b962730..a9355212d724bd 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp @@ -2,28 +2,19 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "emitters/snippets/external_repacking_adjuster.hpp" +#include "emitters/snippets/cpu_runtime_configurator.hpp" #include "memory_desc/cpu_blocked_memory_desc.h" #include "memory_desc/dnnl_blocked_memory_desc.h" #include "snippets/utils/utils.hpp" -#ifndef OPENVINO_ARCH_ARM64 #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" -#endif namespace ov { namespace intel_cpu { -#ifdef OPENVINO_ARCH_ARM64 -BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, - snippets::RuntimeConfigurator* configurator) {} - -bool BrgemmExternalRepackingAdjuster::run(ov::snippets::lowered::LinearIR& linear_ir) { - return false; -} -#else BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( const ov::snippets::lowered::LinearIRCPtr& linear_ir, snippets::RuntimeConfigurator* configurator) : snippets::lowered::pass::RuntimeOptimizer(configurator) { @@ -79,7 +70,6 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin } return true; } -#endif } // namespace intel_cpu } // namespace ov From 32fded10ddf2fe39a6afc6b2b69c6a24a37805c3 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 13 Nov 2024 14:55:47 +0100 Subject: [PATCH 24/42] Serialization passes updated --- .../include/snippets/lowered/pass/serialize_base.hpp | 4 ++-- .../snippets/lowered/pass/serialize_control_flow.hpp | 7 +------ .../snippets/lowered/pass/serialize_data_flow.hpp | 7 +------ .../include/snippets/utils/linear_ir_pass_dumper.hpp | 4 ++-- src/common/snippets/src/lowered/pass/pass.cpp | 9 +++++++-- 5 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/common/snippets/include/snippets/lowered/pass/serialize_base.hpp b/src/common/snippets/include/snippets/lowered/pass/serialize_base.hpp index 51cc528a155a00..560744f4eb09d8 100644 --- 
a/src/common/snippets/include/snippets/lowered/pass/serialize_base.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/serialize_base.hpp @@ -16,9 +16,9 @@ namespace pass { * @brief Base class for LinearIR serialization passes * @ingroup snippets */ -class SerializeBase : public Pass { +class SerializeBase : public ConstPass { public: - OPENVINO_RTTI("SerializeBase", "Pass") + OPENVINO_RTTI("SerializeBase", "ConstPass") SerializeBase(const std::string& xml_path); protected: diff --git a/src/common/snippets/include/snippets/lowered/pass/serialize_control_flow.hpp b/src/common/snippets/include/snippets/lowered/pass/serialize_control_flow.hpp index 602e9d9df7ce32..2e8f91aed6c08d 100644 --- a/src/common/snippets/include/snippets/lowered/pass/serialize_control_flow.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/serialize_control_flow.hpp @@ -22,12 +22,7 @@ class SerializeControlFlow : public SerializeBase { OPENVINO_RTTI("SerializeControlFlow", "Pass", SerializeBase) SerializeControlFlow(const std::string& xml_path, bool update_dynamic_ops = false) : SerializeBase(xml_path), m_update_dynamic_ops{update_dynamic_ops} {} - - bool run(LinearIR& linear_ir) override { - return run(const_cast(linear_ir)); - } - // We need a const method to run from functions that can't change LIR - bool run(const LinearIR& linear_ir); + bool run(const LinearIR& linear_ir) override; private: const bool m_update_dynamic_ops = false; diff --git a/src/common/snippets/include/snippets/lowered/pass/serialize_data_flow.hpp b/src/common/snippets/include/snippets/lowered/pass/serialize_data_flow.hpp index ce5b3855400264..ecbc1a834ce388 100644 --- a/src/common/snippets/include/snippets/lowered/pass/serialize_data_flow.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/serialize_data_flow.hpp @@ -23,12 +23,7 @@ class SerializeDataFlow : public SerializeBase { public: OPENVINO_RTTI("SerializeDataFlow", "Pass", SerializeBase) SerializeDataFlow(const std::string& xml_path) : SerializeBase(xml_path) {} - - bool run(LinearIR& linear_ir) override { - return run(const_cast(linear_ir)); - } - // We need a const method to run from functions that can't change LIR - bool run(const LinearIR& linear_ir); + bool run(const LinearIR& linear_ir) override; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/utils/linear_ir_pass_dumper.hpp b/src/common/snippets/include/snippets/utils/linear_ir_pass_dumper.hpp index 85abfc9a91ab31..c8c145d7eac075 100644 --- a/src/common/snippets/include/snippets/utils/linear_ir_pass_dumper.hpp +++ b/src/common/snippets/include/snippets/utils/linear_ir_pass_dumper.hpp @@ -16,7 +16,7 @@ namespace snippets { class LIRPassDump { public: - explicit LIRPassDump(lowered::LinearIR& linear_ir, std::string pass_name) + explicit LIRPassDump(const lowered::LinearIR& linear_ir, std::string pass_name) : linear_ir(linear_ir), pass_name(std::move(pass_name)), debug_config(linear_ir.get_config().debug_config) { dump("_in"); } @@ -44,7 +44,7 @@ class LIRPassDump { num++; } - lowered::LinearIR& linear_ir; + const lowered::LinearIR& linear_ir; const std::string pass_name; const DebugCapsConfig& debug_config; }; diff --git a/src/common/snippets/src/lowered/pass/pass.cpp b/src/common/snippets/src/lowered/pass/pass.cpp index 5dcd75572df8f5..235d248d8e9838 100644 --- a/src/common/snippets/src/lowered/pass/pass.cpp +++ b/src/common/snippets/src/lowered/pass/pass.cpp @@ -30,13 +30,16 @@ void PassPipeline::register_pass(const std::shared_ptr& pass) { void PassPipeline::run(const 
lowered::LinearIR& linear_ir) const { for (const auto& pass : m_passes) { OPENVINO_ASSERT(pass != nullptr, "PassPipeline has empty pass!"); - // SNIPPETS_DEBUG_LIR_PASS_DUMP(linear_ir, pass); + SNIPPETS_DEBUG_LIR_PASS_DUMP(linear_ir, pass); if (m_pass_config->is_disabled(pass->get_type_info())) { continue; } const auto const_pass = std::dynamic_pointer_cast(pass); - OPENVINO_ASSERT(const_pass != nullptr, "Unexpected pass (", pass->get_type_info(), ") is registered in PassPipeline"); + OPENVINO_ASSERT(const_pass != nullptr, + "Unexpected pass (", + pass->get_type_info(), + ") is registered in PassPipeline. Only ConstPass is allowed."); const_pass->run(linear_ir); } } @@ -55,6 +58,8 @@ void PassPipeline::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearI } if (auto lir_pass = std::dynamic_pointer_cast(pass)) { lir_pass->run(linear_ir); + } else if (auto const_pass = std::dynamic_pointer_cast(pass)) { + const_pass->run(linear_ir); } else if (auto ranged_pass = std::dynamic_pointer_cast(pass)) { ranged_pass->run(linear_ir, begin, end); } else { From aef985f7d8d90e170ca30898fcdacd6b40575a3a Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 13 Nov 2024 15:32:31 +0100 Subject: [PATCH 25/42] Docs and cleanup --- .../snippets/mha_parallel_wa_optimizer.hpp | 10 ++++- .../include/snippets/runtime_configurator.hpp | 38 +++++++++++-------- .../include/snippets/runtime_optimizer.hpp | 5 +++ .../brgemm_copy_b_loop_ports_adjuster.hpp | 4 ++ .../snippets/cpu_runtime_configurator.hpp | 1 - .../snippets/external_repacking_adjuster.hpp | 5 +++ 6 files changed, 45 insertions(+), 18 deletions(-) diff --git a/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp b/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp index 97d0cfce709095..e19cfc095de0aa 100644 --- a/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp +++ b/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp @@ -12,7 +12,15 @@ namespace ov { namespace snippets { namespace lowered { namespace pass { - +/** + * @class MHAParallelWAOptimizer + * @brief Optimizes the dynamic MHA execution increasing parallel work amount dy dividing Brgemm's "M" dimension to "parallel_m" + * and "kernel_m". Uses heuristics from snippets::pass::SplitDimensionM for dimension splitting. + * The optimizer performs the following steps: + * - Identifies applicable Brgemm operations within the LinearIR. + * - Finds parameters whose shapes and layouts need to be adjusted after the split. + * - Determines loops that should be adjusted. + */ class MHAParallelWAOptimizer : public lowered::pass::RuntimeOptimizer { public: MHAParallelWAOptimizer() = default; diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 0550be144d41ca..058caade7a54fd 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -65,22 +65,6 @@ class RuntimeConfigurator { RuntimeConfigurator(std::shared_ptr c); virtual ~RuntimeConfigurator() = default; - // Note that get_type_info_static and get_type_info are needed to mimic OPENVINO_RTTI interface, - // so the standard OPENVINO_RTTI(...) macros could be used in derived classes. 
- _OPENVINO_HIDDEN_METHOD static const ::ov::DiscreteTypeInfo& get_type_info_static() { - static ::ov::DiscreteTypeInfo type_info_static {"RuntimeConfigurator"}; - type_info_static.hash(); - return type_info_static; - } - - virtual const DiscreteTypeInfo& get_type_info() const { - return get_type_info_static(); - } - - const char* get_type_name() const { - return get_type_info().name; - } - /** * @brief Update RuntimeConfig based on new state of LinearIR and return its * @param linear_ir LinearIR @@ -111,6 +95,18 @@ class RuntimeConfigurator { const std::vector& get_io_data_sizes() const { return m_io_data_sizes; } const std::map>& get_dynamic_buffer_clusters() const { return m_dynamic_buffer_clusters; } + /** + * @brief Computes the offsets for each dimension of a tensor shape. + * + * This function calculates the offsets for each dimension of a tensor shape, which represent the distance between + * consecutive elements of the corresponding dimension. If a dimension size is 1, the next dimension starts + * immediately, and the stride is 0. + * @param shape The shape of the tensor. + * @param offsets The offsets which should be updated. + * @param offsets_size Requested offsets size vector. + * @param dim_step The initial step size for the dimensions. + * @param idx_stride Defines the number of dimensions that should be skipped in the offsets vector. + */ static void compute_offsets(const ov::snippets::VectorDims& shape, ov::snippets::VectorDims& offsets, size_t offsets_size, @@ -122,6 +118,11 @@ class RuntimeConfigurator { std::vector ptr_increments; std::vector finalization_offsets; }; + /** + * @brief Retrieves the runtime parameters for a given UnifiedLoopInfo. + * @param unified_loop_info The UnifiedLoopInfo for which the runtime parameters are to be retrieved. + * @return A LoopInfoRuntimeParams object containing the runtime parameters. + */ static UnifiedLoopInfoRtParams get_loop_runtime_params(const lowered::UnifiedLoopInfoPtr& unified_loop_info); using LoopInfoRuntimeParamsMap = std::unordered_map; /** @@ -129,6 +130,11 @@ class RuntimeConfigurator { * @param linear_ir LinearIR */ static void update_loop_info(const lowered::LinearIRCPtr& linear_ir); + /** + * @brief Updates the ExpandedLoopInfo based on the initialized runtime parameters. + * @param expanded_loop_info The ExpandedLoopInfo to be updated. + * @param initialized_info_map A map containing the initialized runtime parameters for UnifiedLoopInfo. + */ static void update_expanded_loop_info(const lowered::ExpandedLoopInfoPtr& expanded_loop_info, LoopInfoRuntimeParamsMap& initializated_info_map); /** diff --git a/src/common/snippets/include/snippets/runtime_optimizer.hpp b/src/common/snippets/include/snippets/runtime_optimizer.hpp index 1b52d89d7e8a07..99522628e23c07 100644 --- a/src/common/snippets/include/snippets/runtime_optimizer.hpp +++ b/src/common/snippets/include/snippets/runtime_optimizer.hpp @@ -12,6 +12,11 @@ namespace ov { namespace snippets { namespace lowered { namespace pass { +/** + * @class RuntimeOptimizer + * @brief Base class for runtime optimizers that operate on LinearIR and RuntimeConfigurator during + * RuntimeConfigurator::update stage. 
+ */ class RuntimeOptimizer : public ConstPass { public: RuntimeOptimizer() = default; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp index c0264b43278a72..108d02c9e46b5e 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp @@ -12,6 +12,10 @@ namespace ov { namespace intel_cpu { +/** + * @class BrgemmCopyBLoopPortsAdjuster + * @brief A runtime optimizer that adjusts blocked loops parameters for Brgemm operations which require repacking. + */ class BrgemmCopyBLoopPortsAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer { public: BrgemmCopyBLoopPortsAdjuster() = default; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index a59489b8cc6fc1..d067316cc78a87 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -27,7 +27,6 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: - OPENVINO_RTTI("CPURuntimeConfigurator", "0", ov::snippets::RuntimeConfigurator) CPURuntimeConfigurator(); /** diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp index 451bd6b85fa08a..45a13cf2169090 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp @@ -16,6 +16,11 @@ namespace ov { namespace intel_cpu { class CPURuntimeConfigurator; +/** + * @class BrgemmExternalRepackingAdjuster + * @brief A runtime optimizer that creates the memory descs for BRGEMM inputs which require external repacking. + * The generated memory descs are stored in the CPU runtime config. 
+ */ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer { public: BrgemmExternalRepackingAdjuster() = default; From 66773fd13fb5eb4087971d28c8e71261ae21ca6e Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 13 Nov 2024 16:16:03 +0100 Subject: [PATCH 26/42] Further cleanup --- .../include/snippets/runtime_configurator.hpp | 2 +- src/common/snippets/src/runtime_configurator.cpp | 3 +-- .../snippets/brgemm_copy_b_loop_ports_adjuster.cpp | 5 ++++- .../snippets/brgemm_copy_b_loop_ports_adjuster.hpp | 2 +- .../emitters/snippets/cpu_runtime_configurator.cpp | 5 ----- .../emitters/snippets/cpu_runtime_configurator.hpp | 3 --- .../snippets/external_repacking_adjuster.cpp | 10 ++++------ .../snippets/external_repacking_adjuster.hpp | 8 ++------ src/plugins/intel_cpu/src/nodes/subgraph.cpp | 2 -- .../lowered/adjust_brgemm_copy_b_loop_ports.hpp | 9 +++------ .../x64/pass/move_brgemm_repacking_out.cpp | 14 +++++--------- 11 files changed, 21 insertions(+), 42 deletions(-) diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 058caade7a54fd..1cb4c00900f950 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -205,7 +205,7 @@ class RuntimeConfigurator { // [cluster_id -> buffer expressions ] std::map> m_dynamic_buffer_clusters = {}; - // WA: until 148891 is not implemented, 2 pass pipelines for runtime optimizers are necessary since different + // WA: until ticket 148891 is not implemented, 2 pass pipelines for runtime optimizers are necessary since different // optimizers must be called at different pipeline stages. // - Intermediate optimizers must be called right after `update_loop_info` // - Final optimizers must be called after all other RuntimeConfigurator's update methods diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 26bd64f227a12a..e2e3afd6e76356 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -8,7 +8,6 @@ #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" #include "snippets/mha_parallel_wa_optimizer.hpp" -#include "snippets/runtime_optimizer.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/utils/loop_utils.hpp" #include "snippets/utils/utils.hpp" @@ -329,7 +328,7 @@ void RuntimeConfigurator::compute_offsets(const ov::snippets::VectorDims& shape, } void RuntimeConfigurator::set_kernel_executor_table(std::shared_ptr table) const { - OPENVINO_ASSERT(table, "Failed to update Kernel Executo Table: passed table is missed"); + OPENVINO_ASSERT(table, "Failed to update Kernel Executor Table: passed table is missed"); m_config->kernel_executor_table = std::move(table); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp index 965847c9053e1b..164a76a7f25223 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp @@ -3,12 +3,15 @@ // #include "brgemm_copy_b_loop_ports_adjuster.hpp" + +#include "snippets/lowered/loop_manager.hpp" #include 
"transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" namespace ov { namespace intel_cpu { -BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator) +BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + CPURuntimeConfigurator* configurator) : ov::snippets::lowered::pass::RuntimeOptimizer(configurator) { const auto& pass = std::make_shared(); pass->run(*linear_ir); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp index 108d02c9e46b5e..be64e111b2f31b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp @@ -6,7 +6,7 @@ #include "cpu_runtime_configurator.hpp" #include "snippets/lowered/linear_ir.hpp" -#include "snippets/lowered/loop_manager.hpp" +#include "snippets/lowered/loop_info.hpp" #include "snippets/runtime_optimizer.hpp" namespace ov { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 16a040d29a3ff7..2ce93fda12b5f2 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -4,17 +4,12 @@ #include "emitters/snippets/cpu_runtime_configurator.hpp" -#include "memory_desc/cpu_blocked_memory_desc.h" -#include "memory_desc/cpu_memory_desc_utils.h" -#include "memory_desc/dnnl_blocked_memory_desc.h" #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" #ifndef OPENVINO_ARCH_ARM64 #include "brgemm_copy_b_loop_ports_adjuster.hpp" #include "external_repacking_adjuster.hpp" -#include "transformations/snippets/x64/op/brgemm_cpu.hpp" -#include "transformations/snippets/x64/op/brgemm_utils.hpp" #endif namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index d067316cc78a87..f36c3b28de1fe1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -41,9 +41,6 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; static const size_t rank6D; - -private: - snippets::lowered::pass::PassPipeline m_cpu_runtime_optimizers; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp index a9355212d724bd..5b08a245c2be6d 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp @@ -6,7 +6,6 @@ #include "emitters/snippets/cpu_runtime_configurator.hpp" #include "memory_desc/cpu_blocked_memory_desc.h" -#include "memory_desc/dnnl_blocked_memory_desc.h" #include "snippets/utils/utils.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -15,9 +14,9 @@ namespace ov { namespace intel_cpu { 
-BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster( - const ov::snippets::lowered::LinearIRCPtr& linear_ir, - snippets::RuntimeConfigurator* configurator) : snippets::lowered::pass::RuntimeOptimizer(configurator) { +BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, + CPURuntimeConfigurator* configurator) + : snippets::lowered::pass::RuntimeOptimizer(configurator) { const auto& params = linear_ir->get_parameters(); for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; @@ -41,7 +40,6 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin auto& optimal_descs = cpu_config->m_in_requested_descs; for (const auto& i : m_param_idces_with_external_repacking) { const auto& shape = m_configurator->get_config()->shapes[i]; - // TODO: support orbitrary order const auto& K = *++shape.rbegin(); const auto& N = *shape.rbegin(); @@ -65,7 +63,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); auto& offsets = cpu_config->io_data_offsets[i]; snippets::RuntimeConfigurator::compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_configurator->get_io_data_sizes()[i], 0); - // TODO: Support non-planar layout + // Ticket 157339: Support non-planar layout OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(m_configurator->get_config()->layouts[i])); } return true; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp index 45a13cf2169090..82072c78c10a95 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp @@ -4,14 +4,10 @@ #pragma once +#include "cpu_runtime_configurator.hpp" #include "snippets/runtime_configurator.hpp" #include "snippets/runtime_optimizer.hpp" -#include "snippets/lowered/port_descriptor.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" - -#include "memory_desc/cpu_blocked_memory_desc.h" - namespace ov { namespace intel_cpu { @@ -24,7 +20,7 @@ class CPURuntimeConfigurator; class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer { public: BrgemmExternalRepackingAdjuster() = default; - BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, snippets::RuntimeConfigurator* configurator); + BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator); bool run(const snippets::lowered::LinearIR& linear_ir) override; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index d46a6c9c105228..de1c5823a018b0 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -3,8 +3,6 @@ // #include "subgraph.h" -#include "memory_desc/dnnl_blocked_memory_desc.h" -#include "memory_desc/cpu_memory_desc_utils.h" #include "common/primitive_hashing_utils.hpp" #include "dnnl_extension_utils.h" #include "onednn/dnnl.h" diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp index 
5c65c7a0282823..794c55d868158a 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp @@ -18,14 +18,11 @@ namespace pass { * Finds loop ports connected to BrgemmCopyB and sets appropriate pointer increments. * @ingroup snippets */ -class AdjustBrgemmCopyBLoopPorts: public snippets::lowered::pass::Pass { +class AdjustBrgemmCopyBLoopPorts: public snippets::lowered::pass::ConstPass { public: AdjustBrgemmCopyBLoopPorts() = default; - OPENVINO_RTTI("AdjustBrgemmCopyBLoopPorts", "Pass"); - bool run(const snippets::lowered::LinearIR& linear_ir); - bool run(snippets::lowered::LinearIR& linear_ir) override { - return run(const_cast(linear_ir)); - } + OPENVINO_RTTI("AdjustBrgemmCopyBLoopPorts", "ConstPass"); + bool run(const snippets::lowered::LinearIR& linear_ir) override; static bool update_loop_info(const snippets::lowered::UnifiedLoopInfoPtr& uni_loop_info); const std::unordered_set& get_affected_loops() { return m_affected_loops; } private: diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp index 6853e9c6ba7928..a6973492f7d95c 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp @@ -7,20 +7,18 @@ #include "cpu/x64/cpu_isa_traits.hpp" #include "openvino/pass/pattern/matcher.hpp" #include "openvino/pass/pattern/op/wrap_type.hpp" +#include "openvino/pass/pattern/op/optional.hpp" #include "snippets/itt.hpp" #include "snippets/op/rank_normalization.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" -#include "transformations/snippets/x64/op/brgemm_utils.hpp" namespace ov { namespace intel_cpu { -using namespace snippets::lowered; - - pass::MoveBrgemmRepackingOut::MoveBrgemmRepackingOut() { MATCHER_SCOPE(MoveBrgemmRepackingOut); auto m_param = ov::pass::pattern::wrap_type(); + auto m_rank_norm = ov::pass::pattern::optional(m_param); auto m_copy_b = ov::pass::pattern::wrap_type({m_param}); auto callback = [=](ov::pass::pattern::Matcher& m) { @@ -30,16 +28,14 @@ pass::MoveBrgemmRepackingOut::MoveBrgemmRepackingOut() { const auto copy_b_node = ov::as_type_ptr(copy_b_out.get_node_shared_ptr()); OPENVINO_ASSERT(copy_b_node, "BrgemmCopyB node is null in MoveBrgemmRepackingOut transformation"); - const auto& in_desc = PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); + const auto& in_desc = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); const auto& layout = in_desc->get_layout(); // TODO: - // 1. handle copyB with compensations - // 2. handle non-planar layout + // 1. Ticket 157340: support external repacking for copyB with compensations + // 2. 
Ticket 157339: support external repacking for non-planar layout if (!ov::snippets::utils::is_planar_layout(layout) || copy_b_node->get_src_element_type() == ov::element::i8 || transformation_callback(copy_b_node)) return false; - std::cout << "copy_b_node = " << copy_b_node << std::endl; - std::cout << "[ INFO ] MoveBrgemmRepackingOut is finished\n"; return ov::replace_output_update_name(copy_b_out, copy_b_node->input_value(0)); }; From 64a9fb9a6b89530715dc01b09b9fa647eeb4c5c2 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 13 Nov 2024 17:20:46 +0100 Subject: [PATCH 27/42] compute_offsets refactoring --- .../include/snippets/runtime_configurator.hpp | 13 +++---------- src/common/snippets/src/runtime_configurator.cpp | 15 +++++++-------- .../snippets/external_repacking_adjuster.cpp | 3 +-- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 1cb4c00900f950..df425991b0f5cd 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -101,18 +101,11 @@ class RuntimeConfigurator { * This function calculates the offsets for each dimension of a tensor shape, which represent the distance between * consecutive elements of the corresponding dimension. If a dimension size is 1, the next dimension starts * immediately, and the stride is 0. - * @param shape The shape of the tensor. - * @param offsets The offsets which should be updated. - * @param offsets_size Requested offsets size vector. - * @param dim_step The initial step size for the dimensions. + * @param shape The shape for offset computation. + * @param idx The index to get the corresponding offsets and io_data_sizes. * @param idx_stride Defines the number of dimensions that should be skipped in the offsets vector. 
*/ - static void compute_offsets(const ov::snippets::VectorDims& shape, - ov::snippets::VectorDims& offsets, - size_t offsets_size, - size_t dim_step, - size_t idx_stride); - + void compute_offsets(const ov::snippets::VectorDims& shape, size_t idx, size_t idx_stride) const; struct UnifiedLoopInfoRtParams { size_t work_amount = 0; std::vector ptr_increments; diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index e2e3afd6e76356..4f833f05e4cfdd 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -281,10 +281,10 @@ void RuntimeConfigurator::update_data_offsets() const { if (utils::is_dynamic_vdims(shape)) return; - auto& offsets = m_config->io_data_offsets[i]; const auto idx_stride = m_config->tensor_rank - shape.size(); - compute_offsets(shape, offsets, m_config->tensor_rank, m_io_data_sizes[i], idx_stride); + compute_offsets(shape, i, idx_stride); + auto& offsets = m_config->io_data_offsets[i]; const auto& layout = layouts[i]; if (!layout.empty()) { std::vector reordered_offsets(offsets.size()); @@ -313,12 +313,11 @@ std::vector> RuntimeConfigurator::extract_layouts() const { return layouts; } -void RuntimeConfigurator::compute_offsets(const ov::snippets::VectorDims& shape, - ov::snippets::VectorDims& offsets, - size_t offsets_size, - size_t dim_step, - size_t idx_stride) { - offsets.resize(offsets_size); +void RuntimeConfigurator::compute_offsets(const ov::snippets::VectorDims& shape, size_t idx, size_t idx_stride) const { + auto& offsets = m_config->io_data_offsets[idx]; + auto dim_step = m_io_data_sizes[idx]; + + offsets.resize(m_config->tensor_rank); std::fill(offsets.begin(), offsets.end(), 0); offsets[offsets.size() - 1] = dim_step; for (int i = static_cast(shape.size()) - 2; i >= 0; i--) { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp index 5b08a245c2be6d..d403defa440b9a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp @@ -61,8 +61,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); - auto& offsets = cpu_config->io_data_offsets[i]; - snippets::RuntimeConfigurator::compute_offsets(shape_for_offset, offsets, shape_for_offset.size(), m_configurator->get_io_data_sizes()[i], 0); + m_configurator->compute_offsets(shape_for_offset, i, 0); // Ticket 157339: Support non-planar layout OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(m_configurator->get_config()->layouts[i])); } From 86667039343cf04c5267808ad4d2806ea192acb6 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Fri, 15 Nov 2024 18:04:11 +0100 Subject: [PATCH 28/42] Correct MHA tokenization --- .../include/snippets/pass/tokenization.hpp | 10 ++++++++-- .../snippets/src/pass/mha_tokenization.cpp | 16 ++++++++++----- .../transformation_pipeline.cpp | 14 ++++++++++++- .../custom/subgraph_tests/src/x64/mha.cpp | 20 +++++++++++++++---- .../snippets/mha_quantized.cpp | 2 +- 5 files changed, 49 insertions(+), 13 deletions(-) diff --git a/src/common/snippets/include/snippets/pass/tokenization.hpp 
b/src/common/snippets/include/snippets/pass/tokenization.hpp index 24efcceec71a24..8a1b293bf9a6af 100644 --- a/src/common/snippets/include/snippets/pass/tokenization.hpp +++ b/src/common/snippets/include/snippets/pass/tokenization.hpp @@ -66,10 +66,10 @@ class SnippetsTokenization : public ov::pass::ModelPass { */ struct Config { Config(size_t concurrency, size_t data_ptr_gpr_count, bool split_m_dimension, bool enable_transpose_on_output, - bool dyn_mha_token, std::set mha_transpose_ranks) + bool dyn_mha_token, std::set mha_transpose_ranks, ov::pass::param_callback mha_tokenize_mm_b_input_callback = nullptr) : m_concurrency(concurrency), m_data_ptr_gpr_count(data_ptr_gpr_count), m_split_m_dimension(split_m_dimension), m_mha_token_enable_transpose_on_output(enable_transpose_on_output), m_is_dynamic_mha_token_enabled(dyn_mha_token), - m_mha_supported_transpose_ranks(std::move(mha_transpose_ranks)) { + m_mha_supported_transpose_ranks(std::move(mha_transpose_ranks)), m_mha_tokenize_mm_b_input_callback(mha_tokenize_mm_b_input_callback) { OPENVINO_ASSERT(concurrency > 0, "Concurrency should be greater than 0"); OPENVINO_ASSERT(data_ptr_gpr_count > 0, "data_ptr_gpr_count should be greater than 0"); } @@ -102,6 +102,10 @@ class SnippetsTokenization : public ov::pass::ModelPass { return m_mha_supported_transpose_ranks; } + bool mha_tokenize_mm_b_input_callback(const std::shared_ptr& node) const { + return m_mha_tokenize_mm_b_input_callback ? m_mha_tokenize_mm_b_input_callback(node) : false; + } + private: size_t m_concurrency = 0; // The number of gpr that can be used as data pointers for data nodes (Parameter (and non-Scalar Constants), @@ -121,6 +125,8 @@ class SnippetsTokenization : public ov::pass::ModelPass { // Note that in general Snippets support Transpose of any ranks. // But at the moment Transpose is used only in MHA pattern where 3D and 4D tensors are supported. 
std::set m_mha_supported_transpose_ranks = { 3, 4 }; + + ov::pass::param_callback m_mha_tokenize_mm_b_input_callback = nullptr; }; OPENVINO_RTTI("SnippetsTokenization", "0"); diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index beb465ab3a3fbe..96babcd54e5161 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -355,7 +355,9 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching const auto is_transposed_b_0 = matmul0->get_transpose_b(); bool has_matmul0_has_ops_on_input = false; - while (is_supported_intermediate_op(parent)) { + + const bool support_mm0_b_input_tokenization = !config.mha_tokenize_mm_b_input_callback(matmul0); + while (support_mm0_b_input_tokenization && is_supported_intermediate_op(parent)) { // All supported ops have only one output port if (parent->get_output_target_inputs(0).size() != 1) break; @@ -404,12 +406,16 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken } }; - const auto transpose1 = ov::as_type_ptr(parent); + if (support_mm0_b_input_tokenization) { + const auto transpose1 = ov::as_type_ptr(parent); + tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin()); + } const auto transpose0 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); - const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); - tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin()); tokenize_transpose(transpose0, matmul0->get_transpose_a(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin()); - tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end()); + if (!config.mha_tokenize_mm_b_input_callback(matmul1)) { + const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); + tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end()); + } ordered_ops.push_back(matmul1); bool are_ops_after_matmul1 = false; diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 9dd1da2d471e5a..47560a1f334408 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -953,9 +953,21 @@ void Transformations::MainSnippets(void) { bool split_m_dimension = !ignoreCallback; // [122706] Some 3D MHA Patterns have perf regressions when Transpose op is tokenized std::set mha_supported_transpose_ranks = { 4 }; + + // Note: this is a temporary WA, avoiding matmul B input tokenization in the cases when CPU . + // It will be removed when plugin specific SubgraphPass will be implemented. 
+ auto mha_tokenize_mm_b_input_callback = [this](const std::shared_ptr& node) { + const auto& input_type_0 = node->get_input_element_type(0); + const auto& input_type_1 = node->get_input_element_type(1); + + const bool u8i8_repacking_wo_compensations = input_type_0 == ov::element::u8 && input_type_1 == ov::element::i8; + const bool bf16_repacking = input_type_0 == ov::element::f32 && input_type_1 == ov::element::f32 && + config.inferencePrecision == ov::element::bf16; + return u8i8_repacking_wo_compensations || bf16_repacking; + }; snippets::pass::SnippetsTokenization::Config tokenization_config(concurrency, data_ptr_gpr_count, split_m_dimension, mha_token_enable_transpose_on_output, is_dynamic_mha_token_enabled, - mha_supported_transpose_ranks); + mha_supported_transpose_ranks, mha_tokenize_mm_b_input_callback); ov::pass::Manager snippetsManager("CPU:Snippets"); snippetsManager.set_per_pass_validation(false); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp index 8517612a348f68..62b7a3390879e1 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp @@ -3,9 +3,9 @@ // #include "common_test_utils/common_utils.hpp" -#include "common_test_utils/ov_tensor_utils.hpp" #include "common_test_utils/node_builders/constant.hpp" #include "common_test_utils/node_builders/fake_quantize.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" #include "internal_properties.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "utils/cpu_test_utils.hpp" @@ -666,15 +666,27 @@ std::vector> matMulIn0PrecisionsQuant = { {ElementType::i8, ElementType::u8}, }; -INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0, +INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0_i8i8, MHAQuantTest, ::testing::Combine(::testing::ValuesIn(static_shapes_to_test_representation(inputShapesQuant)), ::testing::ValuesIn(inputPrecisionsQuant), - ::testing::ValuesIn(matMulIn0PrecisionsQuant), + ::testing::Values(std::vector{ElementType::i8, ElementType::i8}), ::testing::Values(0), ::testing::Values(ExpectedNodes{ {"Subgraph", 5}, // FQs on inputs x 3 + MHA + Deq Mul - {"Transpose", 1}}), // Transpose between MHA and Deq Mul + {"Transpose", 1}}), // Transpose between MHA and Deq Mul + Extracted transpose on B input of 2nd MM + ::testing::Values(ov::test::utils::DEVICE_CPU)), + MHAQuantTest::getTestCaseName); + +INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0_i8u8, + MHAQuantTest, + ::testing::Combine(::testing::ValuesIn(static_shapes_to_test_representation(inputShapesQuant)), + ::testing::ValuesIn(inputPrecisionsQuant), + ::testing::Values(std::vector{ElementType::i8, ElementType::u8}), + ::testing::Values(0), + ::testing::Values(ExpectedNodes{ + {"Subgraph", 5}, // FQs on inputs x 3 + MHA + Deq Mul + {"Transpose", 2}}), // Transpose between MHA and Deq Mul + Extracted transpose on B input of 2nd MM ::testing::Values(ov::test::utils::DEVICE_CPU)), MHAQuantTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp index 0c731b74565863..0a12e0a36a3621 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp 
@@ -48,7 +48,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul + ::testing::Values(7), // FQx3 on inputs + MHA + Transpose on output + Transpose on Matmul's B input + Deq Mul ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), From 435cf45ce17c9b80fe2c1af479cfb30a71d98de9 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 18 Nov 2024 19:15:48 +0100 Subject: [PATCH 29/42] Cover SplitDimensionM heuristic by unit tests --- .../tests/include/utils/split_dim_m.hpp | 37 ++++++++++ .../snippets/tests/src/utils/split_dim_m.cpp | 71 +++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 src/common/snippets/tests/include/utils/split_dim_m.hpp create mode 100644 src/common/snippets/tests/src/utils/split_dim_m.cpp diff --git a/src/common/snippets/tests/include/utils/split_dim_m.hpp b/src/common/snippets/tests/include/utils/split_dim_m.hpp new file mode 100644 index 00000000000000..3e04c2a911d76a --- /dev/null +++ b/src/common/snippets/tests/include/utils/split_dim_m.hpp @@ -0,0 +1,37 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ov { +namespace test { +namespace snippets { + +struct InputData { + size_t cur_batch; + size_t cur_m; + size_t concurrency; +}; + +struct ReferenceData { + bool is_split; + size_t batch_m; + size_t kernel_m; +}; + +struct SplitDimensionMParams { + InputData input; + ReferenceData reference; +}; + +class SplitDimensionMTest : public testing::TestWithParam { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +}; + +} // namespace snippets +} // namespace test +} // namespace ov diff --git a/src/common/snippets/tests/src/utils/split_dim_m.cpp b/src/common/snippets/tests/src/utils/split_dim_m.cpp new file mode 100644 index 00000000000000..69a04da6f1263f --- /dev/null +++ b/src/common/snippets/tests/src/utils/split_dim_m.cpp @@ -0,0 +1,71 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "utils/split_dim_m.hpp" + +#include "common_test_utils/ov_test_utils.hpp" +#include "snippets/pass/split_dimension_m.hpp" +#include "snippets/utils/utils.hpp" + +namespace ov { +namespace test { +namespace snippets { + +std::string SplitDimensionMTest::getTestCaseName(testing::TestParamInfo obj) { + const auto& input = obj.param.input; + const auto& reference = obj.param.reference; + std::ostringstream result; + result << "Batch=" << input.cur_batch << "_"; + result << "CurM=" << input.cur_m << "_"; + result << "OptimalParallelWorkAmount=" << input.concurrency << "_"; + result << "IsSplit=" << reference.is_split << "_"; + result << "BatchM=" << reference.batch_m << "_"; + result << "KernelM=" << reference.kernel_m; + return result.str(); +} + +TEST_P(SplitDimensionMTest, SplitDimensionM) { + const auto& input = GetParam().input; + const auto& reference = GetParam().reference; + + // last_dim is fixed since it doesn't affect the SplitDimensionM result. 
+ static const size_t last_dim = 1024; + ov::Shape shape = {input.cur_batch, input.cur_m, last_dim}; + size_t batch_m_dim, new_m_dim; + bool result = ov::snippets::pass::SplitDimensionM::split(shape, + input.concurrency, + batch_m_dim, + new_m_dim); + + ASSERT_EQ(result, reference.is_split); + if (result) { + ASSERT_EQ(batch_m_dim, reference.batch_m); + ASSERT_EQ(new_m_dim, reference.kernel_m); + } +} + +namespace SplitDimensionMInstantiation { +const std::vector split_dimension_cases = { + // Negative test cases: split is not needed + {InputData{40 /*cur_batch*/, 32 /*cur_m*/, 40 /*concurrency*/}, ReferenceData{false /*is_split*/}}, + {InputData{65, 32, 40}, ReferenceData{false}}, + + // Positive test cases + {InputData{20 /*cur_batch*/, 32 /*cur_m*/, 40 /*concurrency*/}, ReferenceData{true /*is_split*/, 2 /*batch_m*/, 16 /*kernel_m*/}}, + {InputData{30, 60, 40}, ReferenceData{true, 2, 30}}, + {InputData{10, 100, 40}, ReferenceData{true, 4, 25}}, + {InputData{15, 45, 40}, ReferenceData{true, 5, 9}}, + {InputData{25, 50, 40}, ReferenceData{true, 2, 25}}, + {InputData{5, 16384, 40}, ReferenceData{true, 8, 2048}}, +}; + +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SplitDimensionM, + SplitDimensionMTest, + ::testing::ValuesIn(split_dimension_cases), + SplitDimensionMTest::getTestCaseName); + +} // namespace SplitDimensionMInstantiation +} // namespace snippets +} // namespace test +} // namespace ov \ No newline at end of file From 1c64d03c0b510c6b0779b009a3ad9eb5ee0337a8 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Mon, 18 Nov 2024 21:10:40 +0100 Subject: [PATCH 30/42] [WIP] Change splitM heuristic --- .../snippets/src/pass/split_dimension_m.cpp | 30 +++++++++---------- .../tests/src/pass/mha_tokenization.cpp | 12 ++++---- .../snippets/tests/src/utils/split_dim_m.cpp | 15 +++++----- 3 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/common/snippets/src/pass/split_dimension_m.cpp b/src/common/snippets/src/pass/split_dimension_m.cpp index 0f50ad27931e04..a263fb8de0a87a 100644 --- a/src/common/snippets/src/pass/split_dimension_m.cpp +++ b/src/common/snippets/src/pass/split_dimension_m.cpp @@ -34,23 +34,23 @@ bool SplitDimensionM::is_supported_matmul(const std::shared_ptr& std::pair SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { std::pair splited = { 1, m_dim }; - const size_t lower_bound = optimal_parallelism_work_amount / batch_dim; - if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) { - splited.first = lower_bound; - splited.second = m_dim / lower_bound; - OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); - return splited; - } - - const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim); - for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) { - size_t divisor_1 = m_dim / divisor_0; - if (divisor_1 * divisor_0 == m_dim) { - splited.first = divisor_0; - splited.second = divisor_1; - break; + // TODO: should we limit minimal kernel_m? 
+ const size_t min_kernel_m = 4; + // Strategy 1: Find a combination such that (batch_dim * splited.first) % optimal_parallelism_work_amount == 0 + for (size_t divisor = 1; divisor <= m_dim; ++divisor) { + if (m_dim % divisor == 0) { + const auto m_batch = divisor; + const auto m_kernel = m_dim / divisor; + if (m_kernel < min_kernel_m) + break; + splited = { m_batch, m_kernel }; + if ((batch_dim * splited.first) % optimal_parallelism_work_amount == 0) { + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); + return splited; + } } } + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); return splited; } diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index c5932ed690d670..9b8d5596cb2ef6 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -171,7 +171,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + std::vector{{4, 32, 12, 64}, {128, 12, 1, 64}, {12, 4, 32, 128}, {1, 128, 12, 64}, {128, 12, 64}}, true); model = f.getOriginal(); model_ref = f.getReference(); @@ -182,7 +182,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + std::vector{{1, 96, 4, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, false); model = f.getOriginal(); model_ref = f.getReference(); @@ -193,7 +193,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + std::vector{{1, 96, 4, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, true); model = f.getOriginal(); model_ref = f.getReference(); @@ -204,7 +204,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHAWOTranspose_SplitM) { const auto& f = MHAWOTransposeSplitMFunction(std::vector{{10, 9216, 128}, {10, 128, 9216}, {10, 9216, 128}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{10, 3, 3072, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); + std::vector{{10, 9, 1024, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); model = f.getOriginal(); model_ref = f.getReference(); config.set_concurrency(18); @@ -212,9 +212,9 @@ 
TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHAWOTranspose_SplitM) { } TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_SplitM_AlmostAllThreads) { - const auto& f = MHAWOTransposeSplitMFunction(std::vector{{5, 30, 32}, {5, 32, 30}, {5, 30, 32}}, + const auto& f = MHAWOTransposeSplitMFunction(std::vector{{5, 60, 32}, {5, 32, 30}, {5, 30, 32}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{5, 10, 3, 32}, {5, 1, 32, 30}, {5, 1, 30, 32}, {5, 30, 32}}); + std::vector{{5, 15, 4, 32}, {5, 1, 32, 30}, {5, 1, 30, 32}, {5, 60, 32}}); model = f.getOriginal(); model_ref = f.getReference(); config.set_concurrency(32); diff --git a/src/common/snippets/tests/src/utils/split_dim_m.cpp b/src/common/snippets/tests/src/utils/split_dim_m.cpp index 69a04da6f1263f..db574a38f54685 100644 --- a/src/common/snippets/tests/src/utils/split_dim_m.cpp +++ b/src/common/snippets/tests/src/utils/split_dim_m.cpp @@ -48,16 +48,15 @@ TEST_P(SplitDimensionMTest, SplitDimensionM) { namespace SplitDimensionMInstantiation { const std::vector split_dimension_cases = { // Negative test cases: split is not needed - {InputData{40 /*cur_batch*/, 32 /*cur_m*/, 40 /*concurrency*/}, ReferenceData{false /*is_split*/}}, - {InputData{65, 32, 40}, ReferenceData{false}}, + {InputData{32 /*cur_batch*/, 32 /*cur_m*/, 32 /*concurrency*/}, ReferenceData{false /*is_split*/}}, + {InputData{50, 32, 32}, ReferenceData{false}}, // Positive test cases - {InputData{20 /*cur_batch*/, 32 /*cur_m*/, 40 /*concurrency*/}, ReferenceData{true /*is_split*/, 2 /*batch_m*/, 16 /*kernel_m*/}}, - {InputData{30, 60, 40}, ReferenceData{true, 2, 30}}, - {InputData{10, 100, 40}, ReferenceData{true, 4, 25}}, - {InputData{15, 45, 40}, ReferenceData{true, 5, 9}}, - {InputData{25, 50, 40}, ReferenceData{true, 2, 25}}, - {InputData{5, 16384, 40}, ReferenceData{true, 8, 2048}}, + {InputData{20 /*cur_batch*/, 32 /*cur_m*/, 32 /*concurrency*/}, ReferenceData{true /*is_split*/, 8 /*batch_m*/, 4 /*kernel_m*/}}, + {InputData{16, 60, 32}, ReferenceData{true, 2, 30}}, + {InputData{10, 100, 32}, ReferenceData{true, 25, 4}}, + {InputData{25, 50, 32}, ReferenceData{true, 10, 5}}, + {InputData{5, 16384, 32}, ReferenceData{true, 32, 512}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SplitDimensionM, From 63e987606a81e488b60455e0e9840313c766822e Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 19 Nov 2024 10:23:47 +0100 Subject: [PATCH 31/42] Correct Transpose tokenization in tests --- src/common/snippets/src/pass/collapse_subgraph.cpp | 5 ++++- .../tests/functional/shared_tests_instances/snippets/mha.cpp | 2 +- .../shared_tests_instances/snippets/mha_with_dyn_mul.cpp | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 0f0cc225173479..6348f89598523d 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -51,9 +51,12 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { const auto parent = transpose->get_input_node_shared_ptr(0); const auto child = transpose->get_output_target_inputs(0).begin()->get_node()->shared_from_this(); auto is_brgemm_case = ov::is_type(parent) || ov::is_type(child); + auto decomposition_case = true; // Check for Transpose parent is MatMul inside Subgraph if (const auto subgraph = ov::as_type_ptr(parent)) { if (GetSnippetsSubgraphType(subgraph) != SnippetsSubgraphType::Completed) { + // Transpose decomposition 
is supported only for Transpose nodes right after Subgraph's parameters + decomposition_case = false; const auto body = subgraph->body_ptr(); const auto subgraph_output = body->get_results()[transpose->input_value(0).get_index()]->get_input_node_shared_ptr(0); is_brgemm_case = is_brgemm_case || ov::is_type(subgraph_output); @@ -63,7 +66,7 @@ auto is_supported_op(const std::shared_ptr &n) -> bool { const auto& order = as_type_ptr(n->get_input_node_shared_ptr(1)); if (order) { const auto order_value = order->cast_vector(); - return (TransposeDecomposition::is_supported_transpose_order(order_value)) || + return (decomposition_case && TransposeDecomposition::is_supported_transpose_order(order_value)) || (is_brgemm_case && FuseTransposeBrgemm::is_supported_transpose_order(order_value)); } } diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index 63f5176684ccc1..45bb055d086910 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -131,7 +131,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, ::testing::Values(ov::element::bf16), ::testing::ValuesIn({false}), ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), + ::testing::Values(9), ::testing::Values(6), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp index 7876d737af2281..ccd23dd6833f98 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp @@ -56,7 +56,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn(precision_f32(5)), ::testing::Values(ov::element::bf16), ::testing::Values(MHA::default_thread_count), - ::testing::Values(8), // MHA + 1 Transpose on output + 6 Converts around + ::testing::Values(10), // MHA + 1 Transpose on output + 6 Converts around + 2 Transposes on Matmul's B inputs ::testing::Values(7), // MHA + 6 Converts around ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), From ef2a1a69ecfe144ad5c4d9714ef3f2d285fb0622 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 19 Nov 2024 15:07:08 +0100 Subject: [PATCH 32/42] Enable u8i8 and bf16 MHA tokenization with transpose_b=true --- .../src/transformations/transformation_pipeline.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 47560a1f334408..05dfb6a377ec91 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -1018,9 +1018,12 @@ void Transformations::MainSnippets(void) { // Only FP32 dynamic MHA is supported if (matmul->is_dynamic()) return false; - // [114487] brgemm kernel in oneDNN requires brgemm_copy_b kernel if MatMul node has transposed_b=True - // The current solution with ExtractExplicitMatMulTranspose pass is slower for non-f32 cases than using of brgemm_copy_b 
kernel - if (matmul->get_transpose_a() || matmul->get_transpose_b()) + // Ticket 157340: repacking extraction is not supported for i8i8 case. + // If the repacking is performed inside the kernel, it may lead to performance degradation. + if (is_int8 && matmul->get_transpose_b()) + return false; + + if (matmul->get_transpose_a()) return false; // [150842] The execution of Brgemm INT8/BF16 on AMX platforms depends on the value of "K % VNNIFactor". // For more details, please teake a look at the ticket 150842 From 4bc76e2e7d1cf1cf254f26c4557c03754f8547c0 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 19 Nov 2024 19:59:06 +0100 Subject: [PATCH 33/42] Alexandra's comments applied --- .../pass}/mha_parallel_wa_optimizer.hpp | 14 +++--- .../{ => lowered/pass}/runtime_optimizer.hpp | 0 .../include/snippets/runtime_configurator.hpp | 11 ++-- .../pass}/mha_parallel_wa_optimizer.cpp | 50 ++++++++++--------- .../snippets/src/runtime_configurator.cpp | 22 ++++---- .../snippets/cpu_runtime_configurator.cpp | 18 +++---- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 26 ++++------ src/plugins/intel_cpu/src/nodes/subgraph.h | 4 +- .../snippets/x64/op/brgemm_utils.cpp | 4 +- .../brgemm_copy_b_loop_ports_adjuster.cpp | 2 + .../brgemm_copy_b_loop_ports_adjuster.hpp | 4 +- .../lowered}/external_repacking_adjuster.cpp | 19 ++++--- .../lowered}/external_repacking_adjuster.hpp | 5 +- 13 files changed, 90 insertions(+), 89 deletions(-) rename src/common/snippets/include/snippets/{ => lowered/pass}/mha_parallel_wa_optimizer.hpp (82%) rename src/common/snippets/include/snippets/{ => lowered/pass}/runtime_optimizer.hpp (100%) rename src/common/snippets/src/{ => lowered/pass}/mha_parallel_wa_optimizer.cpp (82%) rename src/plugins/intel_cpu/src/{emitters/snippets => transformations/snippets/x64/pass/lowered}/brgemm_copy_b_loop_ports_adjuster.cpp (93%) rename src/plugins/intel_cpu/src/{emitters/snippets => transformations/snippets/x64/pass/lowered}/brgemm_copy_b_loop_ports_adjuster.hpp (89%) rename src/plugins/intel_cpu/src/{emitters/snippets => transformations/snippets/x64/pass/lowered}/external_repacking_adjuster.cpp (80%) rename src/plugins/intel_cpu/src/{emitters/snippets => transformations/snippets/x64/pass/lowered}/external_repacking_adjuster.hpp (88%) diff --git a/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp b/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp similarity index 82% rename from src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp rename to src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp index e19cfc095de0aa..af1e2b60e1d70a 100644 --- a/src/common/snippets/include/snippets/mha_parallel_wa_optimizer.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp @@ -4,9 +4,9 @@ #pragma once -#include "runtime_optimizer.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_info.hpp" +#include "snippets/lowered/pass/runtime_optimizer.hpp" namespace ov { namespace snippets { @@ -37,13 +37,13 @@ class MHAParallelWAOptimizer : public lowered::pass::RuntimeOptimizer { const lowered::LinearIRCPtr& linear_ir, const std::unordered_set& unsqueezed_params); - std::vector loops_to_split{}; - std::unordered_set unsqueezed_params{}; - std::vector> optimized_layouts{}; - std::vector m_dim_idces{}; - size_t concurrency = 0; + std::vector m_loops_to_split{}; + std::unordered_set m_unsqueezed_params{}; + std::vector> m_optimized_layouts{}; + std::vector 
m_dim_M_idces{}; + size_t m_concurrency = 0; - static const size_t m_dim_idx; + static const size_t m_dim_M_idx; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/runtime_optimizer.hpp b/src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp similarity index 100% rename from src/common/snippets/include/snippets/runtime_optimizer.hpp rename to src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index df425991b0f5cd..773e54458be645 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -8,7 +8,6 @@ #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_info.hpp" #include "snippets/lowered/pass/pass.hpp" -#include "snippets/runtime_optimizer.hpp" namespace ov { namespace snippets { @@ -45,15 +44,15 @@ class RuntimeConfig { size_t tensor_rank = 0; size_t tile_rank = 0; - std::vector shapes = {}; - std::vector layouts = {}; + std::vector io_shapes = {}; + std::vector io_layouts = {}; std::vector io_data_offsets = {}; ov::snippets::VectorDims master_shape = {}; size_t buffer_scratchpad_size = 0; std::vector buffer_cluster_offsets {}; KernelExecutorTablePtr kernel_executor_table = std::make_shared(); - std::vector m_latest_shapes = {}; + std::vector latest_shapes = {}; }; /** @@ -204,8 +203,8 @@ class RuntimeConfigurator { // - Final optimizers must be called after all other RuntimeConfigurator's update methods // When all updates will be rewritten on PassPipeline, PositionedPasses can be used to precisely define the place of // the additional optimizers - lowered::pass::PassPipeline m_intermediate_runtime_optimizers; - lowered::pass::PassPipeline m_final_runtime_optimizers; + lowered::pass::PassPipeline m_intermediate_optimizers; + lowered::pass::PassPipeline m_final_optimizers; }; } // namespace snippets diff --git a/src/common/snippets/src/mha_parallel_wa_optimizer.cpp b/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp similarity index 82% rename from src/common/snippets/src/mha_parallel_wa_optimizer.cpp rename to src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp index bb70af011c6c76..7c4c3085679d6b 100644 --- a/src/common/snippets/src/mha_parallel_wa_optimizer.cpp +++ b/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp @@ -2,13 +2,14 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "snippets/mha_parallel_wa_optimizer.hpp" +#include "snippets/lowered/pass/mha_parallel_wa_optimizer.hpp" +#include "snippets/itt.hpp" +#include "snippets/lowered/loop_info.hpp" +#include "snippets/lowered/loop_manager.hpp" #include "snippets/pass/split_dimension_m.hpp" -#include "snippets/utils/utils.hpp" #include "snippets/utils/loop_utils.hpp" -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/loop_info.hpp" +#include "snippets/utils/utils.hpp" namespace ov { namespace snippets { @@ -16,7 +17,7 @@ namespace lowered { namespace pass { using namespace ov::snippets::pass; -const size_t MHAParallelWAOptimizer::m_dim_idx = 1; +const size_t MHAParallelWAOptimizer::m_dim_M_idx = 1; MHAParallelWAOptimizer::MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, RuntimeConfigurator* configurator) : lowered::pass::RuntimeOptimizer(configurator) { @@ -27,30 +28,31 @@ MHAParallelWAOptimizer::MHAParallelWAOptimizer(const 
lowered::LinearIRCPtr& line if (brgemms.empty()) return; - concurrency = linear_ir->get_config().m_min_parallel_work_amount; - unsqueezed_params = find_unsqueezed_params(linear_ir, brgemms); - OPENVINO_ASSERT(!unsqueezed_params.empty(), "unsqueezed_params mustn't be empty after initialization"); - loops_to_split = find_loops_to_split(linear_ir, unsqueezed_params); + m_concurrency = linear_ir->get_config().m_min_parallel_work_amount; + m_unsqueezed_params = find_unsqueezed_params(linear_ir, brgemms); + OPENVINO_ASSERT(!m_unsqueezed_params.empty(), "unsqueezed_params mustn't be empty after initialization"); + m_loops_to_split = find_loops_to_split(linear_ir, m_unsqueezed_params); - m_dim_idces.resize(configurator->get_io_num()); - optimized_layouts.resize(configurator->get_io_num()); + m_dim_M_idces.resize(configurator->get_io_num()); + m_optimized_layouts.resize(configurator->get_io_num()); for (size_t i = 0; i < configurator->get_io_num(); ++i) { const auto& layout = configurator->get_io_descs()[i]->get_layout(); - const auto dim_idx = i < configurator->get_in_num() ? utils::get_input_dim_idx(layout, m_dim_idx) - : utils::get_output_dim_idx(layout, m_dim_idx); - m_dim_idces[i] = dim_idx; + const auto dim_idx = i < configurator->get_in_num() ? utils::get_input_dim_idx(layout, m_dim_M_idx) + : utils::get_output_dim_idx(layout, m_dim_M_idx); + m_dim_M_idces[i] = dim_idx; const auto m_idx = i < configurator->get_in_num() ? dim_idx : layout.size() - 2; - optimized_layouts[i] = SplitDimensionM::get_updated_order(layout, m_idx); + m_optimized_layouts[i] = SplitDimensionM::get_updated_order(layout, m_idx); } } bool MHAParallelWAOptimizer::run(const lowered::LinearIR& linear_ir) { - if (loops_to_split.empty()) + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::MHAParallelWAOptimizer") + if (m_loops_to_split.empty()) return false; const auto& config = m_configurator->get_config(); size_t new_batch_dim, new_kernel_dim; - if (!SplitDimensionM::split(config->master_shape, concurrency, new_batch_dim, new_kernel_dim)) + if (!SplitDimensionM::split(config->master_shape, m_concurrency, new_batch_dim, new_kernel_dim)) return false; auto& master_shape = config->master_shape; *++master_shape.rbegin() = new_kernel_dim; @@ -73,16 +75,16 @@ bool MHAParallelWAOptimizer::run(const lowered::LinearIR& linear_ir) { } }; lowered::LoopInfoSet updated_loops; - for (const auto& loop : loops_to_split) { + for (const auto& loop : m_loops_to_split) { loop->apply(updater, updated_loops); } for (size_t i = 0; i < m_configurator->get_io_num(); ++i) { - config->shapes[i] = unsqueezed_params.count(i) - ? SplitDimensionM::unsqueeze_m_dim(config->shapes[i], m_dim_idces[i]) - : SplitDimensionM::reshape_m_dim(config->shapes[i], m_dim_idces[i], new_batch_dim, new_kernel_dim); + config->io_shapes[i] = m_unsqueezed_params.count(i) + ? 
SplitDimensionM::unsqueeze_m_dim(config->io_shapes[i], m_dim_M_idces[i]) + : SplitDimensionM::reshape_m_dim(config->io_shapes[i], m_dim_M_idces[i], new_batch_dim, new_kernel_dim); } - config->layouts = optimized_layouts; + config->io_layouts = m_optimized_layouts; return true; } @@ -106,7 +108,7 @@ std::unordered_set MHAParallelWAOptimizer::find_applicab return false; bool loop_by_m = true; outermost_loop->iterate_through_ports([&loop_by_m](const lowered::LoopPort& port) { - if (port.is_incremented && port.dim_idx != m_dim_idx) + if (port.is_incremented && port.dim_idx != m_dim_M_idx) loop_by_m = false; }); return loop_by_m; @@ -148,7 +150,7 @@ std::vector MHAParallelWAOptimizer::find_loops_to_ prev_loop_idces = loop_idces; for (const auto& loop_id : loop_idces) { const auto expanded_loop_info = loop_manager->get_loop_info(loop_id); - if (expanded_loop_info->get_dim_idx() == m_dim_idx) { + if (expanded_loop_info->get_dim_idx() == m_dim_M_idx) { loop_idces_to_split.insert(loop_id); } } diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 4f833f05e4cfdd..5a19c61767a22c 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/pass/compute_buffer_allocation_size.hpp" #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" -#include "snippets/mha_parallel_wa_optimizer.hpp" +#include "snippets/lowered/pass/mha_parallel_wa_optimizer.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/utils/loop_utils.hpp" #include "snippets/utils/utils.hpp" @@ -60,21 +60,21 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRCPtr& linear_ir) init_buffer_info(linear_ir); OPENVINO_ASSERT(m_io_num > 0, "LinearIR must have parameters and results"); - m_config->m_latest_shapes.resize(m_io_num); + m_config->latest_shapes.resize(m_io_num); m_config->io_data_offsets.resize(m_io_num); m_config->tile_rank = linear_ir->get_config().m_loop_depth; if (linear_ir->is_dynamic()) - m_intermediate_runtime_optimizers.register_pass(linear_ir, this); + m_intermediate_optimizers.register_pass(linear_ir, this); } void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { m_config->master_shape = linear_ir->get_master_shape(); - m_config->shapes = extract_shapes(); - m_config->layouts = extract_layouts(); + m_config->io_shapes = extract_shapes(); + m_config->io_layouts = extract_layouts(); update_loop_info(linear_ir); - m_intermediate_runtime_optimizers.run(*linear_ir); + m_intermediate_optimizers.run(*linear_ir); update_data_offsets(); @@ -82,8 +82,8 @@ void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { // because `ComputeAllocationSize` depends on subtensors which are updated in the table get_kernel_executor_table()->update_state(linear_ir); update_buffer_scratchpad_size(linear_ir); - m_final_runtime_optimizers.run(*linear_ir); - m_config->m_latest_shapes = std::move(m_config->shapes); + m_final_optimizers.run(*linear_ir); + m_config->latest_shapes = std::move(m_config->io_shapes); } void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { @@ -261,8 +261,8 @@ void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRC } void RuntimeConfigurator::update_data_offsets() const { - const auto& shapes = m_config->shapes; - const auto& layouts = m_config->layouts; + const auto& shapes = 
m_config->io_shapes; + const auto& layouts = m_config->io_layouts; OPENVINO_ASSERT(shapes.size() == m_io_num, "Number of custom shapes must be 0 or be equal to m_io_num"); OPENVINO_ASSERT(layouts.size() == m_io_num, "Number of custom layouts must be 0 or be equal to m_io_num"); for (size_t i = 0; i < m_io_num; ++i) { @@ -276,7 +276,7 @@ void RuntimeConfigurator::update_data_offsets() const { // offsets: s1*s3, s3, 0, 1 const auto& shape = shapes[i]; OPENVINO_ASSERT(m_config->tensor_rank >= shape.size(), "Incorrect tensor rank!"); - if (shape == m_config->m_latest_shapes[i]) + if (shape == m_config->latest_shapes[i]) continue; if (utils::is_dynamic_vdims(shape)) return; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 2ce93fda12b5f2..d4be7235131ead 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -8,8 +8,8 @@ #include "snippets/utils/utils.hpp" #ifndef OPENVINO_ARCH_ARM64 -#include "brgemm_copy_b_loop_ports_adjuster.hpp" -#include "external_repacking_adjuster.hpp" +#include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" +#include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif namespace ov { namespace intel_cpu { @@ -44,20 +44,20 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI RuntimeConfigurator::initialization(linear_ir); #ifndef OPENVINO_ARCH_ARM64 if (linear_ir->is_dynamic()) - m_intermediate_runtime_optimizers.register_pass(linear_ir, this); - m_final_runtime_optimizers.register_pass(linear_ir, this); + m_intermediate_optimizers.register_pass(linear_ir, this); + m_final_optimizers.register_pass(linear_ir, this); #endif } void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { m_config->master_shape = linear_ir->get_master_shape(); - m_config->shapes = extract_shapes(); - m_config->layouts = extract_layouts(); + m_config->io_shapes = extract_shapes(); + m_config->io_layouts = extract_layouts(); if (linear_ir->is_dynamic()) { update_loop_info(linear_ir); } - m_intermediate_runtime_optimizers.run(*linear_ir); + m_intermediate_optimizers.run(*linear_ir); // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` // because `ComputeAllocationSize` depends on subtensors which are updated in the table @@ -68,8 +68,8 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l update_loop_args(linear_ir); } update_data_offsets(); - m_final_runtime_optimizers.run(*linear_ir); - m_config->m_latest_shapes = std::move(m_config->shapes); + m_final_optimizers.run(*linear_ir); + m_config->latest_shapes = std::move(m_config->io_shapes); } void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index de1c5823a018b0..aaa12d303bb232 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -77,9 +77,8 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator, - const dnnl::engine& engine) - : SubgraphExecutor(snippet_attrs, 
snippet, start_offset_in, start_offset_out, snippet_config, allocator, engine) {} + const BufferScratchpadAllocator& allocator) + : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) {} void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { const auto& callable = m_schedule->get_callable(); @@ -121,9 +120,8 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator, - const dnnl::engine& engine) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator, engine) { + const BufferScratchpadAllocator& allocator) + : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) { buffer_offsets = snippet_config->buffer_cluster_offsets; data_offsets = snippet_config->io_data_offsets; loop_args = snippet_config->loop_args; @@ -792,8 +790,7 @@ void Subgraph::prepareParams() { start_offset_in, start_offset_out, snippet_config, - allocator, - getEngine()); + allocator); } else { // Static case: // 1. Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be compiled in JIT code @@ -809,8 +806,7 @@ void Subgraph::prepareParams() { start_offset_in, start_offset_out, snippet_config, - allocator, - getEngine()); + allocator); } }; @@ -899,8 +895,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator, - const dnnl::engine& engine) + const BufferScratchpadAllocator& allocator) : m_schedule(snippet->get()), m_start_offset_in(start_offset_in), m_start_offset_out(start_offset_out) { OPENVINO_ASSERT(m_schedule, "Schedule is empty!"); OPENVINO_ASSERT(snippet_config, "Runtime Config is empty!"); @@ -911,7 +906,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptrbuffer_scratchpad_size; OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(m_buffer_scratchpad_size), "Undefined buffer scratchpad size!"); - const auto internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; + m_internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; m_in_requested_descs = snippet_config->m_in_requested_descs; const auto external_repacking_buffer_size = std::accumulate(m_in_requested_descs.begin(), @@ -920,7 +915,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& requested_desc_elem) { return sum + requested_desc_elem.second->getCurrentMemSize(); }); - m_buffer_scratchpad = allocator(internal_buffer_size + external_repacking_buffer_size); + m_buffer_scratchpad = allocator(m_internal_buffer_size + external_repacking_buffer_size); #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) const auto target = std::dynamic_pointer_cast(snippet_attrs->snippet->get_generator()->get_target_machine()); @@ -1005,8 +1000,7 @@ void Subgraph::SubgraphExecutor::execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs) { - const auto internal_buffer_size = static_cast(m_nthreads) * m_buffer_scratchpad_size; - size_t offset = internal_buffer_size; + size_t offset = m_internal_buffer_size; for (const auto& requested_descs_elem : m_in_requested_descs) { const auto in_idx = 
requested_descs_elem.first; const auto& requested_desc = requested_descs_elem.second; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 5b0eed96080023..0cc5258f3d18e7 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -126,8 +126,7 @@ class Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator, - const dnnl::engine& engine); + const BufferScratchpadAllocator& allocator); virtual ~SubgraphExecutor() = default; void execute(dnnl::stream strm, std::vector& inMemPtrs, std::vector& outMemPtrs); @@ -154,6 +153,7 @@ class Subgraph::SubgraphExecutor { // Buffer scratchpad MemoryPtr m_buffer_scratchpad = nullptr; size_t m_buffer_scratchpad_size = 0; + size_t m_internal_buffer_size = 0; const size_t rank6D = 6; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index 42d9449b49ce8c..424430cdc192f0 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -6,6 +6,7 @@ #include "dnnl_extension_utils.h" #include "emitters/utils.hpp" +#include "snippets/lowered/expressions/buffer_expression.hpp" #include "snippets/op/buffer.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" @@ -91,7 +92,8 @@ const ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::l const auto b_input_expr = brgemm_expr->get_input_port_connector(1)->get_source().get_expr(); if (ov::is_type(b_input_expr->get_node())) { return b_input_expr; - } else if (ov::is_type(b_input_expr->get_node())) { + } else if (ov::is_type(b_input_expr)) { + OPENVINO_ASSERT(b_input_expr->get_input_count() >= 1, "BufferExpression on brgemm's B input must have at least one input"); const auto input_buffer_expr = b_input_expr->get_input_port_connector(0)->get_source().get_expr(); if (ov::is_type(b_input_expr->get_node())) { return input_buffer_expr; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp similarity index 93% rename from src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp rename to src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp index 164a76a7f25223..089d91aba809fb 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp @@ -4,6 +4,7 @@ #include "brgemm_copy_b_loop_ports_adjuster.hpp" +#include "snippets/itt.hpp" #include "snippets/lowered/loop_manager.hpp" #include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" @@ -27,6 +28,7 @@ BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::l } bool BrgemmCopyBLoopPortsAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmCopyBLoopPortsAdjuster") if (m_affected_uni2exp_map.empty()) return false; diff --git 
a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp similarity index 89% rename from src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp rename to src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp index be64e111b2f31b..c33cb0d502f19f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/brgemm_copy_b_loop_ports_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp @@ -4,10 +4,10 @@ #pragma once -#include "cpu_runtime_configurator.hpp" +#include "emitters/snippets/cpu_runtime_configurator.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/loop_info.hpp" -#include "snippets/runtime_optimizer.hpp" +#include "snippets/lowered/pass/runtime_optimizer.hpp" namespace ov { namespace intel_cpu { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp similarity index 80% rename from src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp rename to src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index d403defa440b9a..327d82761ad566 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -2,12 +2,12 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "emitters/snippets/external_repacking_adjuster.hpp" +#include "external_repacking_adjuster.hpp" #include "emitters/snippets/cpu_runtime_configurator.hpp" #include "memory_desc/cpu_blocked_memory_desc.h" +#include "snippets/itt.hpp" #include "snippets/utils/utils.hpp" - #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" @@ -24,35 +24,40 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp const bool brgemm_with_extracted_repacking = std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); - return port.get_index() == 1 && brgemm && brgemm_utils::with_repacking(brgemm->get_type()); + return brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && port.get_index() == 1; }); if (brgemm_with_extracted_repacking) { m_param_idces_with_external_repacking.insert(i); + // Ticket 157339: Support non-planar layout + OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(configurator->get_io_descs()[i]->get_layout()), + "Non-planar layout is not supported for external repacking"); } } } bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") if (m_param_idces_with_external_repacking.empty()) return false; const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); auto& optimal_descs = cpu_config->m_in_requested_descs; for (const auto& i : m_param_idces_with_external_repacking) { - const auto& shape = m_configurator->get_config()->shapes[i]; + const auto& shape = cpu_config->io_shapes[i]; const auto& K = *++shape.rbegin(); const auto& N = 
*shape.rbegin(); const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); + const size_t brgemm_kernel_rank = 2; // Firstly, batch dims are set - VectorDims requested_blocked_shape(shape.begin(), shape.end() - cpu_config->tile_rank); + VectorDims requested_blocked_shape(shape.begin(), shape.end() - brgemm_kernel_rank); // Then, the blocked dims are formed requested_blocked_shape.insert( requested_blocked_shape.end(), {snippets::utils::div_up(K, vnni_factor), std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), vnni_factor}); - VectorDims requested_order(shape.size() - cpu_config->tile_rank); + VectorDims requested_order(shape.size() - brgemm_kernel_rank); std::iota(requested_order.begin(), requested_order.end(), 0); const auto last_idx = shape.size() - 1; requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); @@ -62,8 +67,6 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); m_configurator->compute_offsets(shape_for_offset, i, 0); - // Ticket 157339: Support non-planar layout - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(m_configurator->get_config()->layouts[i])); } return true; } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp similarity index 88% rename from src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp rename to src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp index 82072c78c10a95..fb22beaca63ae1 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp @@ -4,14 +4,13 @@ #pragma once -#include "cpu_runtime_configurator.hpp" +#include "emitters/snippets/cpu_runtime_configurator.hpp" +#include "snippets/lowered/pass/runtime_optimizer.hpp" #include "snippets/runtime_configurator.hpp" -#include "snippets/runtime_optimizer.hpp" namespace ov { namespace intel_cpu { -class CPURuntimeConfigurator; /** * @class BrgemmExternalRepackingAdjuster * @brief A runtime optimizer that creates the memory descs for BRGEMM inputs which require external repacking. 
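
A minimal standalone sketch of the shape computation above may help when reviewing the adjuster: for a planar B input [batch..., K, N] it builds the blocked shape [batch..., ceil(K / vnni), max(N, inner_n_block), vnni] that the plugin-side reorder is asked to produce before the kernel runs. The free functions requested_blocked_shape and div_up and the example values (vnni = 2, n_block = 32, typical for bf16) are assumptions for illustration only; in the patch these quantities come from brgemm_utils::compute_vnni_factor() and brgemm_utils::repacking::compute_inner_n_block().

// Illustrative sketch only, not part of the patch: mirrors the
// requested_blocked_shape construction in BrgemmExternalRepackingAdjuster::run().
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

using VectorDims = std::vector<size_t>;

inline size_t div_up(size_t a, size_t b) { return (a + b - 1) / b; }

// For a planar B shape [batch..., K, N], build the blocked shape
// [batch..., ceil(K / vnni), max(N, n_block), vnni] requested from the
// external reorder before the snippets kernel is executed.
VectorDims requested_blocked_shape(const VectorDims& planar_shape, size_t vnni, size_t n_block) {
    const size_t kernel_rank = 2;  // the brgemm kernel owns the last two dims (K, N)
    const size_t K = planar_shape[planar_shape.size() - 2];
    const size_t N = planar_shape[planar_shape.size() - 1];

    VectorDims blocked(planar_shape.begin(), planar_shape.end() - kernel_rank);  // batch dims first
    blocked.insert(blocked.end(), {div_up(K, vnni), std::max(N, n_block), vnni});
    return blocked;
}

int main() {
    // Example: B input of shape [1, 16, 384, 64] with assumed vnni = 2 and
    // inner N block = 32 yields the blocked shape [1, 16, 192, 64, 2].
    const auto blocked = requested_blocked_shape({1, 16, 384, 64}, /*vnni=*/2, /*n_block=*/32);
    for (auto d : blocked)
        std::cout << d << ' ';
    std::cout << '\n';
}

At runtime this blocked memory descriptor is recreated whenever the configuration is updated and exposed through m_in_requested_descs; SubgraphExecutor then reserves external_repacking_buffer_size on top of the internal scratchpad and repacks the affected inputs in exec_in_reorders() before launching the kernel.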
From 6860c67f3a760fc61091eac108d11d286bbd921d Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Tue, 19 Nov 2024 21:48:47 +0100 Subject: [PATCH 34/42] Ivan's comments applied --- src/common/snippets/include/snippets/pass/tokenization.hpp | 2 +- src/common/snippets/include/snippets/runtime_configurator.hpp | 2 +- .../src/transformations/snippets/x64/op/brgemm_utils.cpp | 2 +- .../src/transformations/snippets/x64/op/brgemm_utils.hpp | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/snippets/include/snippets/pass/tokenization.hpp b/src/common/snippets/include/snippets/pass/tokenization.hpp index 8a1b293bf9a6af..ee9c25f05104f7 100644 --- a/src/common/snippets/include/snippets/pass/tokenization.hpp +++ b/src/common/snippets/include/snippets/pass/tokenization.hpp @@ -69,7 +69,7 @@ class SnippetsTokenization : public ov::pass::ModelPass { bool dyn_mha_token, std::set mha_transpose_ranks, ov::pass::param_callback mha_tokenize_mm_b_input_callback = nullptr) : m_concurrency(concurrency), m_data_ptr_gpr_count(data_ptr_gpr_count), m_split_m_dimension(split_m_dimension), m_mha_token_enable_transpose_on_output(enable_transpose_on_output), m_is_dynamic_mha_token_enabled(dyn_mha_token), - m_mha_supported_transpose_ranks(std::move(mha_transpose_ranks)), m_mha_tokenize_mm_b_input_callback(mha_tokenize_mm_b_input_callback) { + m_mha_supported_transpose_ranks(std::move(mha_transpose_ranks)), m_mha_tokenize_mm_b_input_callback(std::move(mha_tokenize_mm_b_input_callback)) { OPENVINO_ASSERT(concurrency > 0, "Concurrency should be greater than 0"); OPENVINO_ASSERT(data_ptr_gpr_count > 0, "data_ptr_gpr_count should be greater than 0"); } diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 773e54458be645..7edb916d8154b0 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -118,7 +118,7 @@ class RuntimeConfigurator { static UnifiedLoopInfoRtParams get_loop_runtime_params(const lowered::UnifiedLoopInfoPtr& unified_loop_info); using LoopInfoRuntimeParamsMap = std::unordered_map; /** - * @brief Update Loop informations in LinearIR: Unified and ExpandedLoopInfo + * @brief Update Loop information in LinearIR: Unified and ExpandedLoopInfo * @param linear_ir LinearIR */ static void update_loop_info(const lowered::LinearIRCPtr& linear_ir); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index 424430cdc192f0..2982fd7767486f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -87,7 +87,7 @@ size_t compute_inner_n_block(const ov::element::Type& precision) { } } -const ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) { +ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) { OPENVINO_ASSERT(ov::is_type(brgemm_expr->get_node()), "get_copy_b_expr must be called only for BrgemmCPU node"); const auto b_input_expr = brgemm_expr->get_input_port_connector(1)->get_source().get_expr(); if (ov::is_type(b_input_expr->get_node())) { diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp 
b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index a56d4d23672001..d15a76c5e4f15d 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -57,11 +57,11 @@ T compute_LDB(T n_block, const ov::element::Type& precision) { std::max(n_block, static_cast(compute_inner_n_block(precision))); } /** - * @brief Retrieves the expression pointer for the brgemm_copy_b emitter corresponding to the given BrgemmCPU expression. + * @brief Retrieves the expression pointer for the brgemm_copy_b expression corresponding to the given BrgemmCPU expression. * @param brgemm_expr The expression pointer for the BrgemmCPU operation. * @return The expression pointer for the BrgemmCopyB operation. */ -const snippets::lowered::ExpressionPtr get_copy_b_expr(const snippets::lowered::ExpressionPtr& brgemm_expr); +snippets::lowered::ExpressionPtr get_copy_b_expr(const snippets::lowered::ExpressionPtr& brgemm_expr); } // namespace repacking } // namespace brgemm_utils } // namespace intel_cpu From 2ad10d276d2284903dda8d20c6fe772090efbc70 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Wed, 20 Nov 2024 20:19:00 +0100 Subject: [PATCH 35/42] Rest review comments --- .../pass/mha_parallel_wa_optimizer.hpp | 3 +- .../lowered/pass/runtime_optimizer.hpp | 25 ++++++++- .../include/snippets/runtime_configurator.hpp | 2 +- .../pass/mha_parallel_wa_optimizer.cpp | 5 +- .../snippets/src/runtime_configurator.cpp | 5 +- .../snippets/cpu_runtime_configurator.cpp | 7 +-- .../snippets/cpu_runtime_configurator.hpp | 2 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 23 ++++---- src/plugins/intel_cpu/src/nodes/subgraph.h | 4 +- .../snippets/x64/op/brgemm_utils.cpp | 2 +- .../snippets/x64/op/brgemm_utils.hpp | 2 +- ...ng_out.cpp => eliminate_brgemm_copy_b.cpp} | 12 ++--- .../x64/pass/eliminate_brgemm_copy_b.hpp | 29 ++++++++++ .../adjust_brgemm_copy_b_loop_ports.cpp | 53 +++++++------------ .../brgemm_copy_b_loop_ports_adjuster.cpp | 5 +- .../brgemm_copy_b_loop_ports_adjuster.hpp | 3 +- .../x64/pass/lowered/brgemm_cpu_blocking.cpp | 15 +++--- .../lowered/external_repacking_adjuster.cpp | 5 +- .../lowered/external_repacking_adjuster.hpp | 3 +- .../x64/pass/move_brgemm_repacking_out.hpp | 22 -------- .../transformation_pipeline.cpp | 5 +- 21 files changed, 123 insertions(+), 109 deletions(-) rename src/plugins/intel_cpu/src/transformations/snippets/x64/pass/{move_brgemm_repacking_out.cpp => eliminate_brgemm_copy_b.cpp} (83%) create mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp delete mode 100644 src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp diff --git a/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp b/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp index af1e2b60e1d70a..9af247cd52ecab 100644 --- a/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/mha_parallel_wa_optimizer.hpp @@ -24,9 +24,10 @@ namespace pass { class MHAParallelWAOptimizer : public lowered::pass::RuntimeOptimizer { public: MHAParallelWAOptimizer() = default; - MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, RuntimeConfigurator* configurator); + MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, const RuntimeConfigurator* configurator); bool run(const 
lowered::LinearIR& linear_ir) override; + bool applicable() const override { return !m_loops_to_split.empty(); } private: static std::unordered_set find_applicable_brgemms(const lowered::LinearIRCPtr& linear_ir); diff --git a/src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp b/src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp index 99522628e23c07..ed37a1c6c58bca 100644 --- a/src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/runtime_optimizer.hpp @@ -20,9 +20,30 @@ namespace pass { class RuntimeOptimizer : public ConstPass { public: RuntimeOptimizer() = default; - RuntimeOptimizer(RuntimeConfigurator* configurator) : m_configurator(configurator) {} + RuntimeOptimizer(const RuntimeConfigurator* configurator) : m_configurator(configurator) { + OPENVINO_ASSERT(configurator, "RuntimeConfigurator musn't be nullptr"); + } + /** + * @brief Defines if this pass is applicable. If it is not applicable, its registration in pass pipeline can be skipped. + */ + virtual bool applicable() const = 0; + + /** + * @brief Creates an instance of the specified pass type and checks if it is applicable. + * If the pass is applicable, it is registered in the provided pipeline. + * @param pipeline The pipeline in which the pass should be registered. + * @param args The arguments to be forwarded to the pass constructor. + */ + template ::value>> + static void register_if_applicable(PassPipeline& pipeline, Args&&... args) { + auto pass = std::make_shared(std::forward(args)...); + if (pass->applicable()) { + pipeline.register_pass(pass); + } + } + protected: - RuntimeConfigurator* m_configurator = nullptr; + const RuntimeConfigurator* m_configurator = nullptr; }; } // namespace pass diff --git a/src/common/snippets/include/snippets/runtime_configurator.hpp b/src/common/snippets/include/snippets/runtime_configurator.hpp index 7edb916d8154b0..866e98843fcd50 100644 --- a/src/common/snippets/include/snippets/runtime_configurator.hpp +++ b/src/common/snippets/include/snippets/runtime_configurator.hpp @@ -133,7 +133,7 @@ class RuntimeConfigurator { * @brief Update tensor rank based on master shape * @param master_shape Master shape */ - virtual void update_tensor_rank(const ov::snippets::VectorDims& master_shape); + virtual void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const; protected: /** diff --git a/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp b/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp index 7c4c3085679d6b..2f57d6422cf11d 100644 --- a/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp +++ b/src/common/snippets/src/lowered/pass/mha_parallel_wa_optimizer.cpp @@ -19,7 +19,7 @@ using namespace ov::snippets::pass; const size_t MHAParallelWAOptimizer::m_dim_M_idx = 1; -MHAParallelWAOptimizer::MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, RuntimeConfigurator* configurator) +MHAParallelWAOptimizer::MHAParallelWAOptimizer(const lowered::LinearIRCPtr& linear_ir, const RuntimeConfigurator* configurator) : lowered::pass::RuntimeOptimizer(configurator) { if (linear_ir->get_config().m_enable_domain_optimization || !linear_ir->is_dynamic()) return; @@ -47,9 +47,6 @@ MHAParallelWAOptimizer::MHAParallelWAOptimizer(const lowered::LinearIRCPtr& line bool MHAParallelWAOptimizer::run(const lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::MHAParallelWAOptimizer") 
- if (m_loops_to_split.empty()) - return false; - const auto& config = m_configurator->get_config(); size_t new_batch_dim, new_kernel_dim; if (!SplitDimensionM::split(config->master_shape, m_concurrency, new_batch_dim, new_kernel_dim)) diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 5a19c61767a22c..41cfdd7d6df381 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -17,6 +17,7 @@ namespace snippets { using namespace ov::snippets::pass; using namespace ov::snippets::lowered; +using namespace ov::snippets::lowered::pass; #ifdef SNIPPETS_DEBUG_CAPS std::string RuntimeConfig::to_string() const { @@ -65,7 +66,7 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRCPtr& linear_ir) m_config->tile_rank = linear_ir->get_config().m_loop_depth; if (linear_ir->is_dynamic()) - m_intermediate_optimizers.register_pass(linear_ir, this); + RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); } void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { @@ -86,7 +87,7 @@ void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { m_config->latest_shapes = std::move(m_config->io_shapes); } -void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { +void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) const { m_config->tensor_rank = master_shape.size(); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index d4be7235131ead..283b5bf621b85f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -13,6 +13,7 @@ #endif namespace ov { namespace intel_cpu { +using namespace ov::snippets::lowered::pass; const size_t CPURuntimeConfigurator::rank6D = 6; @@ -44,8 +45,8 @@ void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearI RuntimeConfigurator::initialization(linear_ir); #ifndef OPENVINO_ARCH_ARM64 if (linear_ir->is_dynamic()) - m_intermediate_optimizers.register_pass(linear_ir, this); - m_final_optimizers.register_pass(linear_ir, this); + RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); + RuntimeOptimizer::register_if_applicable(m_final_optimizers, linear_ir, this); #endif } @@ -72,7 +73,7 @@ void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& l m_config->latest_shapes = std::move(m_config->io_shapes); } -void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) { +void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) const { m_config->tensor_rank = std::max(master_shape.size(), rank6D); } diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index f36c3b28de1fe1..42ce35a3c66c2b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -36,7 +36,7 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; protected: void update(const 
ov::snippets::lowered::LinearIRCPtr& linear_ir) override; - void update_tensor_rank(const ov::snippets::VectorDims& master_shape) override; + void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; void init_tensor_rank(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const override; void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index aaa12d303bb232..f56a7f27257ad4 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -35,7 +35,7 @@ #include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" #include "transformations/snippets/x64/pass/remove_converts.hpp" #include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp" -#include "transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp" +#include "transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp" #include "transformations/snippets/x64/pass/enforce_precision.hpp" #include "transformations/snippets/x64/shape_inference.hpp" #include "transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.hpp" @@ -650,7 +650,7 @@ Subgraph::DataFlowPasses Subgraph::getDataFlowPasses() { SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::Before, ov::snippets::pass::PropagatePrecision, ov::intel_cpu::pass::BrgemmToBrgemmCPU); SNIPPETS_REGISTER_PASS_RELATIVE_X86_64(Place::After, ov::intel_cpu::pass::BrgemmToBrgemmCPU, - ov::intel_cpu::pass::MoveBrgemmRepackingOut); + ov::intel_cpu::pass::EliminateBrgemmCopyB); SNIPPETS_REGISTER_PASS_ABSOLUTE_X86_64(Place::PipelineEnd, ov::intel_cpu::pass::RemoveConverts); SNIPPETS_REGISTER_PASS_ABSOLUTE_COMMON(Place::PipelineEnd, ov::intel_cpu::pass::MulAddToFMA); @@ -992,14 +992,17 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function& inMemPtrs, std::vector& outMemPtrs) { - if (m_in_requested_descs.empty()) +void Subgraph::SubgraphExecutor::execute(dnnl::stream strm, const std::vector& inMemPtrs, const std::vector& outMemPtrs) { + if (!m_in_requested_descs.empty()) { + auto reorderedInMemPtrs = exec_in_reorders(strm, inMemPtrs); + exec_impl(reorderedInMemPtrs, outMemPtrs); + } else { exec_impl(inMemPtrs, outMemPtrs); - else - reorder_execute(strm, inMemPtrs, outMemPtrs); + } } -void Subgraph::SubgraphExecutor::reorder_execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs) { +std::vector Subgraph::SubgraphExecutor::exec_in_reorders(dnnl::stream strm, const std::vector& inMemPtrs) { + auto reordered_in_ptrs = inMemPtrs; size_t offset = m_internal_buffer_size; for (const auto& requested_descs_elem : m_in_requested_descs) { const auto in_idx = requested_descs_elem.first; @@ -1007,11 +1010,11 @@ void Subgraph::SubgraphExecutor::reorder_execute(dnnl::stream strm, std::vector< const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; const auto scratch_mem = std::make_shared(strm.get_engine(), requested_desc, data_ptr, false); - scratch_mem->load(*inMemPtrs[in_idx]); - inMemPtrs[in_idx] = scratch_mem; + scratch_mem->load(*reordered_in_ptrs[in_idx]); + reordered_in_ptrs[in_idx] = scratch_mem; offset += requested_desc->getCurrentMemSize(); } - exec_impl(inMemPtrs, outMemPtrs); + return reordered_in_ptrs; } } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 0cc5258f3d18e7..cf907349bda25b 100644 --- 
a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -129,7 +129,7 @@ class Subgraph::SubgraphExecutor { const BufferScratchpadAllocator& allocator); virtual ~SubgraphExecutor() = default; - void execute(dnnl::stream strm, std::vector& inMemPtrs, std::vector& outMemPtrs); + void execute(dnnl::stream strm, const std::vector& inMemPtrs, const std::vector& outMemPtrs); protected: virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; @@ -169,7 +169,7 @@ class Subgraph::SubgraphExecutor { #endif private: - void reorder_execute(dnnl::stream strm, std::vector inMemPtrs, const std::vector& outMemPtrs); + std::vector exec_in_reorders(dnnl::stream strm, const std::vector& inMemPtrs); std::unordered_map m_in_requested_descs = {}; }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index 2982fd7767486f..6a4fc83d409355 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -95,7 +95,7 @@ ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::lowered } else if (ov::is_type(b_input_expr)) { OPENVINO_ASSERT(b_input_expr->get_input_count() >= 1, "BufferExpression on brgemm's B input must have at least one input"); const auto input_buffer_expr = b_input_expr->get_input_port_connector(0)->get_source().get_expr(); - if (ov::is_type(b_input_expr->get_node())) { + if (ov::is_type(input_buffer_expr->get_node())) { return input_buffer_expr; } } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index d15a76c5e4f15d..0d8e3f5fb6fc9b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -18,7 +18,7 @@ enum class BRGEMM_TYPE { STAND_ALONE, // No extra requirements, used for f32|f32 WITH_AMX, // i8|i8 or bf16|bf16 on AMX system - needs BrgemmCopyB and scratchpad WITH_COMPENSATIONS, // i8|i8 (non-AMX system) - needs BrgemmCopyB for data repacking and compensations - REPACKING_ONLY, // low precision or some specific f32 cases - needs BrgemmCopyB on second input for data repacking + REPACKING_ONLY, // u8|i8, or bf16|bf16 (non-AMX system), or brgemm with transpose_b=true - needs BrgemmCopyB on second input for data repacking }; dnnl::impl::cpu::x64::cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp similarity index 83% rename from src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp rename to src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index a6973492f7d95c..4ad2bb8a11a667 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // -#include "move_brgemm_repacking_out.hpp" +#include "eliminate_brgemm_copy_b.hpp" #include "cpu/x64/cpu_isa_traits.hpp" #include 
"openvino/pass/pattern/matcher.hpp" @@ -15,18 +15,18 @@ namespace ov { namespace intel_cpu { -pass::MoveBrgemmRepackingOut::MoveBrgemmRepackingOut() { - MATCHER_SCOPE(MoveBrgemmRepackingOut); +pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { + MATCHER_SCOPE(EliminateBrgemmCopyB); auto m_param = ov::pass::pattern::wrap_type(); auto m_rank_norm = ov::pass::pattern::optional(m_param); auto m_copy_b = ov::pass::pattern::wrap_type({m_param}); auto callback = [=](ov::pass::pattern::Matcher& m) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::MoveBrgemmRepackingOut") + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "ov::intel_cpu::pass::EliminateBrgemmCopyB") const auto& pattern_map = m.get_pattern_value_map(); const auto& copy_b_out = pattern_map.at(m_copy_b); const auto copy_b_node = ov::as_type_ptr(copy_b_out.get_node_shared_ptr()); - OPENVINO_ASSERT(copy_b_node, "BrgemmCopyB node is null in MoveBrgemmRepackingOut transformation"); + OPENVINO_ASSERT(copy_b_node, "BrgemmCopyB node is null in EliminateBrgemmCopyB transformation"); const auto& in_desc = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); const auto& layout = in_desc->get_layout(); @@ -34,7 +34,7 @@ pass::MoveBrgemmRepackingOut::MoveBrgemmRepackingOut() { // 1. Ticket 157340: support external repacking for copyB with compensations // 2. Ticket 157339: support external repacking for non-planar layout if (!ov::snippets::utils::is_planar_layout(layout) || - copy_b_node->get_src_element_type() == ov::element::i8 || transformation_callback(copy_b_node)) + brgemm_utils::with_compensations(copy_b_node->get_type()) || transformation_callback(copy_b_node)) return false; return ov::replace_output_update_name(copy_b_out, copy_b_node->input_value(0)); }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp new file mode 100644 index 00000000000000..2cdeae53fab026 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/pass/graph_rewrite.hpp" + +namespace ov { +namespace intel_cpu { +namespace pass { + +/** + * @interface EliminateBrgemmCopyB + * @brief EliminateBrgemmCopyB identifies BrgemmCopyB nodes which can be inferred outside the Subgraph. + * If this is possible, CopyB node is removed, and the external repacking is configured on the further pipeline stages in RuntimeConfigurator. 
+ * + * @ingroup snippets + */ +class EliminateBrgemmCopyB: public ov::pass::MatcherPass { +public: + OPENVINO_RTTI("EliminateBrgemmCopyB", "0"); + EliminateBrgemmCopyB(); +}; + + +} // namespace pass +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index 8d734e288514bf..7dfe711a5a5c67 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -65,17 +65,12 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li bool modified = false; - auto get_repacking_loop_idces = [](const snippets::lowered::ExpressionPtr& parent_expr) { + auto get_repacking_loop_idces = [](const snippets::lowered::ExpressionPtr& brgemm_expr) { // Repacking may be extracted outside the snippets kernel. In this case, brgemm parent expression is a parameter. - if (is_type(parent_expr->get_node())) + if (is_type(brgemm_expr->get_input_port_connector(1)->get_source().get_expr()->get_node())) return std::vector{}; - - OPENVINO_ASSERT(is_type(parent_expr), - "In case of repacking brgemm expr must have BufferExpression on B input"); - const auto buffer_parent_ports = parent_expr->get_input_port(0).get_connected_ports(); - OPENVINO_ASSERT(buffer_parent_ports.size() == 1, - "Parent of brgemm repacking buffer must be connected only to the buffer"); - const auto& repacking_expr = buffer_parent_ports.begin()->get_expr(); + const auto repacking_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr); + OPENVINO_ASSERT(repacking_expr, "BrgemmCopyB expression is not found"); return repacking_expr->get_loop_ids(); }; @@ -83,30 +78,22 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li const auto brgemm = ov::as_type_ptr(expr->get_node()); if (!brgemm || !brgemm_utils::with_repacking(brgemm->get_type())) continue; - const auto& parent_expr = expr->get_input_port_connector(1)->get_source().get_expr(); - const auto& repacking_loop_ids = get_repacking_loop_idces(parent_expr); - for (const auto& target_port : parent_expr->get_output_port(0).get_connected_ports()) { - const auto& port_node = target_port.get_expr()->get_node(); - if (!is_type(port_node)) { - OPENVINO_ASSERT(is_type(port_node), - "Invalid grandchild of BrgemmCopyB"); - continue; - } - const auto &brgemm_loop_ids = target_port.get_expr()->get_loop_ids(); - // Continue if there is no blocking loop - if (brgemm_loop_ids.empty() && repacking_loop_ids.empty()) - continue; - OPENVINO_ASSERT(brgemm_loop_ids.size() > repacking_loop_ids.size(), "Invalid BrgemmCopyB loop configuration"); - const auto &loop_manager = linear_ir.get_loop_manager(); - for (auto i = repacking_loop_ids.size(); i < brgemm_loop_ids.size(); i++) { - const auto &loop = loop_manager->get_loop_info(brgemm_loop_ids[i]); - auto uni_loop = ov::as_type_ptr(loop); - if (!uni_loop) - uni_loop = ov::as_type_ptr(loop)->get_unified_loop_info(); - if (!m_affected_loops.count(uni_loop) && update_loop_info(uni_loop)) { - m_affected_loops.insert(uni_loop); - modified = true; - } + const auto& brgemm_loop_ids = expr->get_loop_ids(); + const auto& repacking_loop_ids = get_repacking_loop_idces(expr); + // Continue if there is no blocking loop + if (brgemm_loop_ids.empty() && 
repacking_loop_ids.empty()) + continue; + + OPENVINO_ASSERT(brgemm_loop_ids.size() > repacking_loop_ids.size(), "Invalid BrgemmCopyB loop configuration"); + const auto &loop_manager = linear_ir.get_loop_manager(); + for (auto i = repacking_loop_ids.size(); i < brgemm_loop_ids.size(); i++) { + const auto &loop = loop_manager->get_loop_info(brgemm_loop_ids[i]); + auto uni_loop = ov::as_type_ptr(loop); + if (!uni_loop) + uni_loop = ov::as_type_ptr(loop)->get_unified_loop_info(); + if (!m_affected_loops.count(uni_loop) && update_loop_info(uni_loop)) { + m_affected_loops.insert(uni_loop); + modified = true; } } } diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp index 089d91aba809fb..509f9ecf149c8e 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp @@ -12,7 +12,7 @@ namespace ov { namespace intel_cpu { BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, - CPURuntimeConfigurator* configurator) + const CPURuntimeConfigurator* configurator) : ov::snippets::lowered::pass::RuntimeOptimizer(configurator) { const auto& pass = std::make_shared(); pass->run(*linear_ir); @@ -29,9 +29,6 @@ BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::l bool BrgemmCopyBLoopPortsAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmCopyBLoopPortsAdjuster") - if (m_affected_uni2exp_map.empty()) - return false; - for (const auto& p : m_affected_uni2exp_map) { const auto& uni_loop = p.first; const auto& exp_loops = p.second; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp index c33cb0d502f19f..7b9f30ac96e4b1 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp @@ -19,9 +19,10 @@ namespace intel_cpu { class BrgemmCopyBLoopPortsAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer { public: BrgemmCopyBLoopPortsAdjuster() = default; - BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator); + BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, const CPURuntimeConfigurator* configurator); bool run(const snippets::lowered::LinearIR& linear_ir) override; + bool applicable() const override { return !m_affected_uni2exp_map.empty(); } private: std::unordered_mapget_input_port_descriptor(0)->set_subtensor({get_full_dim_value(), get_full_dim_value()}); - copy_b_expr->get_output_port_descriptor(0)->set_subtensor({get_full_dim_value(), get_full_dim_value()}); - if (with_compensations(type)) { - const ov::snippets::VectorDims compensations_subtensor{1, get_full_dim_value()}; - OPENVINO_ASSERT(brgemm_expr->get_input_count() == 3, "Brgemm must have 3 inputs in case of compensations."); - 
brgemm_expr->get_input_port_descriptor(2)->set_subtensor(compensations_subtensor); - copy_b_expr->get_output_port_descriptor(1)->set_subtensor(compensations_subtensor); - } + const ov::snippets::VectorDims full_subtensor(2, get_full_dim_value()); + copy_b_expr->get_input_port_descriptor(0)->set_subtensor(full_subtensor); + copy_b_expr->get_output_port_descriptor(0)->set_subtensor(full_subtensor); } if (with_amx(type)) { move_new_memory_buffer(linear_ir, brgemm_it); @@ -102,8 +97,12 @@ bool BrgemmCPUBlocking::mark_blocking_loops(LinearIR& linear_ir, const auto& loop_manager = linear_ir.get_loop_manager(); if (with_compensations(type)) { + const ov::snippets::VectorDims compensations_subtensor{1, get_full_dim_value()}; OPENVINO_ASSERT(brgemm_expr->get_input_count() == 3, "Brgemm must have 3 inputs in case of compensations."); + OPENVINO_ASSERT(copy_b_expr, "BrgemmCopyB must be present in case of compensations."); const auto& compens_port = brgemm_expr->get_input_port(2); + compens_port.get_descriptor_ptr()->set_subtensor(compensations_subtensor); + copy_b_expr->get_output_port_descriptor(1)->set_subtensor(compensations_subtensor); const auto& loop_ids = brgemm_expr->get_loop_ids(); size_t i = 0; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 327d82761ad566..e98c8ebbecf49b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -15,7 +15,7 @@ namespace ov { namespace intel_cpu { BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, - CPURuntimeConfigurator* configurator) + const CPURuntimeConfigurator* configurator) : snippets::lowered::pass::RuntimeOptimizer(configurator) { const auto& params = linear_ir->get_parameters(); for (size_t i = 0; i < params.size(); ++i) { @@ -37,9 +37,6 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") - if (m_param_idces_with_external_repacking.empty()) - return false; - const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); auto& optimal_descs = cpu_config->m_in_requested_descs; for (const auto& i : m_param_idces_with_external_repacking) { diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp index fb22beaca63ae1..f102af8f23fe5b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp @@ -19,9 +19,10 @@ namespace intel_cpu { class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::RuntimeOptimizer { public: BrgemmExternalRepackingAdjuster() = default; - BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, CPURuntimeConfigurator* configurator); + BrgemmExternalRepackingAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, const 
CPURuntimeConfigurator* configurator); bool run(const snippets::lowered::LinearIR& linear_ir) override; + bool applicable() const override { return !m_param_idces_with_external_repacking.empty(); } private: std::set m_param_idces_with_external_repacking; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp deleted file mode 100644 index c82193c93f1d4b..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/move_brgemm_repacking_out.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "openvino/pass/graph_rewrite.hpp" - -namespace ov { -namespace intel_cpu { -namespace pass { - -class MoveBrgemmRepackingOut: public ov::pass::MatcherPass { -public: - OPENVINO_RTTI("MoveBrgemmRepackingOut", "0"); - MoveBrgemmRepackingOut(); -}; - - -} // namespace pass -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index 05dfb6a377ec91..e67fbc238a8e10 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -954,8 +954,9 @@ void Transformations::MainSnippets(void) { // [122706] Some 3D MHA Patterns have perf regressions when Transpose op is tokenized std::set mha_supported_transpose_ranks = { 4 }; - // Note: this is a temporary WA, avoiding matmul B input tokenization in the cases when CPU . - // It will be removed when plugin specific SubgraphPass will be implemented. + // If preliminary repacking is needed, it is executed outside the snippets kernel for performance reasons, + // so tokenization of ops sequences on matmul's B input is disabled + // Ticket 157743: This logic should be placed in CPU specific SubgraphPass. 
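Aside on the mechanism behind the comment above: the external repacking is realized by two pieces shown earlier in this series - BrgemmExternalRepackingAdjuster records the requested descriptors in the CPU runtime config's m_in_requested_descs map, and Subgraph::SubgraphExecutor redirects the corresponding inputs into the shared scratchpad before launching the kernel. Below is a minimal, self-contained sketch of that second step using plain byte buffers; RequestedDesc, the function name and its signature are illustrative stand-ins rather than the real MemoryDesc/Memory classes, and the actual reordering (done via Memory::load in the patch) is reduced to a comment.

    #include <cstddef>
    #include <cstdint>
    #include <map>
    #include <vector>

    struct RequestedDesc { std::size_t repacked_size; };  // stand-in for a requested memory descriptor

    // Returns per-input data pointers: inputs without a requested descriptor keep their
    // original pointer, the others are redirected into a region of the shared scratchpad.
    std::vector<const std::uint8_t*> reorder_inputs(std::vector<const std::uint8_t*> in_ptrs,
                                                    const std::map<std::size_t, RequestedDesc>& requested,
                                                    std::vector<std::uint8_t>& scratchpad,
                                                    std::size_t internal_buffer_size) {
        std::size_t offset = internal_buffer_size;  // repacked regions follow the kernel's internal buffers
        for (const auto& entry : requested) {
            std::uint8_t* region = scratchpad.data() + offset;
            // The real code wraps 'region' in a Memory object and loads (repacks) the original
            // input into it; that repacking step is elided here.
            in_ptrs[entry.first] = region;
            offset += entry.second.repacked_size;
        }
        return in_ptrs;
    }

The two properties mirrored from the patch are that the caller's pointer vector is taken by value (the original memory objects are never modified) and that the repacked regions are laid out after the kernel's internal buffers, starting at m_internal_buffer_size.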
auto mha_tokenize_mm_b_input_callback = [this](const std::shared_ptr& node) { const auto& input_type_0 = node->get_input_element_type(0); const auto& input_type_1 = node->get_input_element_type(1); From 11865aaf2c094fe2c36dc65483048abeddcf0f05 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 21 Nov 2024 13:32:22 +0100 Subject: [PATCH 36/42] Further refactoring in accordance with review suggestions --- .../snippets/src/runtime_configurator.cpp | 13 +++++----- .../snippets/cpu_runtime_configurator.cpp | 24 +++---------------- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 6 ++--- src/plugins/intel_cpu/src/nodes/subgraph.h | 4 ++-- .../brgemm_copy_b_loop_ports_adjuster.cpp | 3 +++ 5 files changed, 18 insertions(+), 32 deletions(-) diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 41cfdd7d6df381..96d13074d042ba 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -52,6 +52,8 @@ const std::shared_ptr& RuntimeConfigurator::get_updated_config(co initialization(linear_ir); update(linear_ir); + // Note: after 'update' is finished, io_shapes can be corrupted, so we move it to latest_shapes to avoid copying + m_config->latest_shapes = std::move(m_config->io_shapes); return m_config; } @@ -65,26 +67,25 @@ void RuntimeConfigurator::initialization(const lowered::LinearIRCPtr& linear_ir) m_config->io_data_offsets.resize(m_io_num); m_config->tile_rank = linear_ir->get_config().m_loop_depth; - if (linear_ir->is_dynamic()) - RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); + RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); } void RuntimeConfigurator::update(const lowered::LinearIRCPtr& linear_ir) { m_config->master_shape = linear_ir->get_master_shape(); m_config->io_shapes = extract_shapes(); m_config->io_layouts = extract_layouts(); - update_loop_info(linear_ir); + if (linear_ir->is_dynamic()) + update_loop_info(linear_ir); m_intermediate_optimizers.run(*linear_ir); - update_data_offsets(); - // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` // because `ComputeAllocationSize` depends on subtensors which are updated in the table get_kernel_executor_table()->update_state(linear_ir); update_buffer_scratchpad_size(linear_ir); + + update_data_offsets(); m_final_optimizers.run(*linear_ir); - m_config->latest_shapes = std::move(m_config->io_shapes); } void RuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) const { diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 283b5bf621b85f..b2758735b2d27a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -44,33 +44,15 @@ CPURuntimeConfigurator::CPURuntimeConfigurator() : ov::snippets::RuntimeConfigur void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { RuntimeConfigurator::initialization(linear_ir); #ifndef OPENVINO_ARCH_ARM64 - if (linear_ir->is_dynamic()) - RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); + RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); RuntimeOptimizer::register_if_applicable(m_final_optimizers, linear_ir, this); #endif } 
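Aside: with the is_dynamic() guards removed here, unconditional registration only works because the adjusters now expose an applicable() hook (see the BrgemmCopyBLoopPortsAdjuster and BrgemmExternalRepackingAdjuster headers earlier in this series). A minimal sketch of the registration contract this implies is given below; Optimizer, Pipeline and the helper body are illustrative guesses, not the real RuntimeOptimizer and pass-pipeline classes.

    #include <memory>
    #include <utility>
    #include <vector>

    struct Optimizer {
        virtual ~Optimizer() = default;
        virtual bool applicable() const = 0;  // e.g. "!m_affected_uni2exp_map.empty()"
        virtual bool run() = 0;
    };

    struct Pipeline {
        std::vector<std::shared_ptr<Optimizer>> passes;
        void register_pass(std::shared_ptr<Optimizer> p) { passes.push_back(std::move(p)); }
    };

    // The optimizer is always constructed (its constructor may bail out early, as
    // BrgemmCopyBLoopPortsAdjuster now does for static IRs), but it is registered
    // only if it actually found something to adjust.
    template <typename T, typename... Args>
    void register_if_applicable(Pipeline& pipeline, Args&&... args) {
        auto opt = std::make_shared<T>(std::forward<Args>(args)...);
        if (opt->applicable())
            pipeline.register_pass(std::move(opt));
    }

This also explains why the early-return emptiness checks could be dropped from the adjusters' run() methods: an optimizer with no affected loops or parameters is simply never registered.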
void CPURuntimeConfigurator::update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { - m_config->master_shape = linear_ir->get_master_shape(); - m_config->io_shapes = extract_shapes(); - m_config->io_layouts = extract_layouts(); - if (linear_ir->is_dynamic()) { - update_loop_info(linear_ir); - } - - m_intermediate_optimizers.run(*linear_ir); - - // Update KernelExecutor Table should be before `update_buffer_scratchpad_size` - // because `ComputeAllocationSize` depends on subtensors which are updated in the table - get_kernel_executor_table()->update_state(linear_ir); - update_buffer_scratchpad_size(linear_ir); - - if (linear_ir->is_dynamic()) { + RuntimeConfigurator::update(linear_ir); + if (linear_ir->is_dynamic()) update_loop_args(linear_ir); - } - update_data_offsets(); - m_final_optimizers.run(*linear_ir); - m_config->latest_shapes = std::move(m_config->io_shapes); } void CPURuntimeConfigurator::update_tensor_rank(const ov::snippets::VectorDims& master_shape) const { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index f56a7f27257ad4..a23835d398cbe7 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -992,16 +992,16 @@ void Subgraph::SubgraphExecutor::parallel_forNd(const std::function& inMemPtrs, const std::vector& outMemPtrs) { +void Subgraph::SubgraphExecutor::execute(const dnnl::stream& strm, const std::vector& inMemPtrs, const std::vector& outMemPtrs) { if (!m_in_requested_descs.empty()) { - auto reorderedInMemPtrs = exec_in_reorders(strm, inMemPtrs); + auto reorderedInMemPtrs = reorder_inputs(strm, inMemPtrs); exec_impl(reorderedInMemPtrs, outMemPtrs); } else { exec_impl(inMemPtrs, outMemPtrs); } } -std::vector Subgraph::SubgraphExecutor::exec_in_reorders(dnnl::stream strm, const std::vector& inMemPtrs) { +std::vector Subgraph::SubgraphExecutor::reorder_inputs(const dnnl::stream& strm, const std::vector& inMemPtrs) { auto reordered_in_ptrs = inMemPtrs; size_t offset = m_internal_buffer_size; for (const auto& requested_descs_elem : m_in_requested_descs) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index cf907349bda25b..8040da0a98ef57 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -129,7 +129,7 @@ class Subgraph::SubgraphExecutor { const BufferScratchpadAllocator& allocator); virtual ~SubgraphExecutor() = default; - void execute(dnnl::stream strm, const std::vector& inMemPtrs, const std::vector& outMemPtrs); + void execute(const dnnl::stream& strm, const std::vector& inMemPtrs, const std::vector& outMemPtrs); protected: virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; @@ -169,7 +169,7 @@ class Subgraph::SubgraphExecutor { #endif private: - std::vector exec_in_reorders(dnnl::stream strm, const std::vector& inMemPtrs); + std::vector reorder_inputs(const dnnl::stream& strm, const std::vector& inMemPtrs); std::unordered_map m_in_requested_descs = {}; }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp index 509f9ecf149c8e..d88e0660e9e6fb 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp +++ 
b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.cpp @@ -14,6 +14,9 @@ namespace intel_cpu { BrgemmCopyBLoopPortsAdjuster::BrgemmCopyBLoopPortsAdjuster(const ov::snippets::lowered::LinearIRCPtr& linear_ir, const CPURuntimeConfigurator* configurator) : ov::snippets::lowered::pass::RuntimeOptimizer(configurator) { + if (!linear_ir->is_dynamic()) + return; + const auto& pass = std::make_shared(); pass->run(*linear_ir); const auto& affected_uni_loops = pass->get_affected_loops(); From 8a391f16b99eb7cb4a89ed6a8d7eeeca980d34a6 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 21 Nov 2024 16:33:00 +0100 Subject: [PATCH 37/42] Revert "Enable u8i8 and bf16 MHA tokenization with transpose_b=true" This reverts commit 1de39e898ca68a316cc36393be6115ebb695e97d. --- .../src/transformations/transformation_pipeline.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index e67fbc238a8e10..fc39b94fe1088d 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -1019,12 +1019,9 @@ void Transformations::MainSnippets(void) { // Only FP32 dynamic MHA is supported if (matmul->is_dynamic()) return false; - // Ticket 157340: repacking extraction is not supported for i8i8 case. - // If the repacking is performed inside the kernel, it may lead to performance degradation. - if (is_int8 && matmul->get_transpose_b()) - return false; - - if (matmul->get_transpose_a()) + // [114487] brgemm kernel in oneDNN requires brgemm_copy_b kernel if MatMul node has transposed_b=True + // The current solution with ExtractExplicitMatMulTranspose pass is slower for non-f32 cases than using of brgemm_copy_b kernel + if (matmul->get_transpose_a() || matmul->get_transpose_b()) return false; // [150842] The execution of Brgemm INT8/BF16 on AMX platforms depends on the value of "K % VNNIFactor". // For more details, please teake a look at the ticket 150842 From bbd607d7ed796ae74b475a42bfd4773c83522cf6 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 21 Nov 2024 16:33:14 +0100 Subject: [PATCH 38/42] Revert "[WIP] Change splitM heuristic" This reverts commit fb623305a53dc0306bf43d33ca816b4a19a7b15d. --- .../snippets/src/pass/split_dimension_m.cpp | 30 +++++++++---------- .../tests/src/pass/mha_tokenization.cpp | 12 ++++---- .../snippets/tests/src/utils/split_dim_m.cpp | 15 +++++----- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/src/common/snippets/src/pass/split_dimension_m.cpp b/src/common/snippets/src/pass/split_dimension_m.cpp index a263fb8de0a87a..0f50ad27931e04 100644 --- a/src/common/snippets/src/pass/split_dimension_m.cpp +++ b/src/common/snippets/src/pass/split_dimension_m.cpp @@ -34,23 +34,23 @@ bool SplitDimensionM::is_supported_matmul(const std::shared_ptr& std::pair SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { std::pair splited = { 1, m_dim }; - // TODO: should we limit minimal kernel_m? 
- const size_t min_kernel_m = 4; - // Strategy 1: Find a combination such that (batch_dim * splited.first) % optimal_parallelism_work_amount == 0 - for (size_t divisor = 1; divisor <= m_dim; ++divisor) { - if (m_dim % divisor == 0) { - const auto m_batch = divisor; - const auto m_kernel = m_dim / divisor; - if (m_kernel < min_kernel_m) - break; - splited = { m_batch, m_kernel }; - if ((batch_dim * splited.first) % optimal_parallelism_work_amount == 0) { - OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); - return splited; - } - } + const size_t lower_bound = optimal_parallelism_work_amount / batch_dim; + if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) { + splited.first = lower_bound; + splited.second = m_dim / lower_bound; + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); + return splited; } + const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim); + for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) { + size_t divisor_1 = m_dim / divisor_0; + if (divisor_1 * divisor_0 == m_dim) { + splited.first = divisor_0; + splited.second = divisor_1; + break; + } + } OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); return splited; } diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index 9b8d5596cb2ef6..c5932ed690d670 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -171,7 +171,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { const auto& f = MHASplitMFunction(std::vector{{128, 12, 64}, {128, 12, 64}, {12, 128, 128}, {128, 12, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{4, 32, 12, 64}, {128, 12, 1, 64}, {12, 4, 32, 128}, {1, 128, 12, 64}, {128, 12, 64}}, + std::vector{{2, 64, 12, 64}, {128, 12, 1, 64}, {12, 2, 64, 128}, {1, 128, 12, 64}, {128, 12, 64}}, true); model = f.getOriginal(); model_ref = f.getReference(); @@ -182,7 +182,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA3D_SplitM_withMul) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{1, 96, 4, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, false); model = f.getOriginal(); model_ref = f.getReference(); @@ -193,7 +193,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { const auto& f = MHASplitMFunction(std::vector{{1, 384, 16, 64}, {1, 384, 16, 64}, {1, 1, 1, 384}, {1, 384, 16, 64}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{1, 96, 4, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, + std::vector{{1, 6, 64, 16, 64}, {1, 384, 16, 1, 64}, {1, 1, 1, 1, 384}, {1, 1, 384, 16, 64}, {1, 384, 16, 64}}, true); model = f.getOriginal(); 
model_ref = f.getReference(); @@ -204,7 +204,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA4D_SplitM_withMul) { TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHAWOTranspose_SplitM) { const auto& f = MHAWOTransposeSplitMFunction(std::vector{{10, 9216, 128}, {10, 128, 9216}, {10, 9216, 128}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{10, 9, 1024, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); + std::vector{{10, 3, 3072, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); model = f.getOriginal(); model_ref = f.getReference(); config.set_concurrency(18); @@ -212,9 +212,9 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHAWOTranspose_SplitM) { } TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHA_SplitM_AlmostAllThreads) { - const auto& f = MHAWOTransposeSplitMFunction(std::vector{{5, 60, 32}, {5, 32, 30}, {5, 30, 32}}, + const auto& f = MHAWOTransposeSplitMFunction(std::vector{{5, 30, 32}, {5, 32, 30}, {5, 30, 32}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{5, 15, 4, 32}, {5, 1, 32, 30}, {5, 1, 30, 32}, {5, 60, 32}}); + std::vector{{5, 10, 3, 32}, {5, 1, 32, 30}, {5, 1, 30, 32}, {5, 30, 32}}); model = f.getOriginal(); model_ref = f.getReference(); config.set_concurrency(32); diff --git a/src/common/snippets/tests/src/utils/split_dim_m.cpp b/src/common/snippets/tests/src/utils/split_dim_m.cpp index db574a38f54685..69a04da6f1263f 100644 --- a/src/common/snippets/tests/src/utils/split_dim_m.cpp +++ b/src/common/snippets/tests/src/utils/split_dim_m.cpp @@ -48,15 +48,16 @@ TEST_P(SplitDimensionMTest, SplitDimensionM) { namespace SplitDimensionMInstantiation { const std::vector split_dimension_cases = { // Negative test cases: split is not needed - {InputData{32 /*cur_batch*/, 32 /*cur_m*/, 32 /*concurrency*/}, ReferenceData{false /*is_split*/}}, - {InputData{50, 32, 32}, ReferenceData{false}}, + {InputData{40 /*cur_batch*/, 32 /*cur_m*/, 40 /*concurrency*/}, ReferenceData{false /*is_split*/}}, + {InputData{65, 32, 40}, ReferenceData{false}}, // Positive test cases - {InputData{20 /*cur_batch*/, 32 /*cur_m*/, 32 /*concurrency*/}, ReferenceData{true /*is_split*/, 8 /*batch_m*/, 4 /*kernel_m*/}}, - {InputData{16, 60, 32}, ReferenceData{true, 2, 30}}, - {InputData{10, 100, 32}, ReferenceData{true, 25, 4}}, - {InputData{25, 50, 32}, ReferenceData{true, 10, 5}}, - {InputData{5, 16384, 32}, ReferenceData{true, 32, 512}}, + {InputData{20 /*cur_batch*/, 32 /*cur_m*/, 40 /*concurrency*/}, ReferenceData{true /*is_split*/, 2 /*batch_m*/, 16 /*kernel_m*/}}, + {InputData{30, 60, 40}, ReferenceData{true, 2, 30}}, + {InputData{10, 100, 40}, ReferenceData{true, 4, 25}}, + {InputData{15, 45, 40}, ReferenceData{true, 5, 9}}, + {InputData{25, 50, 40}, ReferenceData{true, 2, 25}}, + {InputData{5, 16384, 40}, ReferenceData{true, 8, 2048}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SplitDimensionM, From 504dbb2ae121cff12f5a88b0e82360290199643e Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 21 Nov 2024 16:34:22 +0100 Subject: [PATCH 39/42] Revert "Correct MHA tokenization" This reverts commit e1c3ed7456adebeec3bd64a0d9ea6f5d6be14ecc. 
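Aside: the revert above restores the original SplitDimensionM::get_splited_dimensions heuristic - first try to complement the batch dimension exactly to the optimal parallel work amount, otherwise search downward from roughly 2x oversubscription for any divisor of M. The standalone sketch below restates that logic with plain std types (the function and parameter names are illustrative, not OpenVINO's) and checks it against a few rows of the restored split_dim_m.cpp table.

    #include <cassert>
    #include <cstddef>
    #include <utility>

    static std::pair<std::size_t, std::size_t> split_m(std::size_t batch, std::size_t m, std::size_t concurrency) {
        std::pair<std::size_t, std::size_t> split{1, m};
        // Case 1: batch * lower_bound covers the optimal parallel work amount exactly.
        const std::size_t lower_bound = concurrency / batch;
        if (lower_bound * batch == concurrency && m % lower_bound == 0)
            return {lower_bound, m / lower_bound};
        // Otherwise search downward from ~2x oversubscription for any divisor of M.
        const std::size_t upper_bound = (2 * concurrency + batch - 1) / batch;  // div_up
        for (std::size_t d = upper_bound - 1; d > 1; --d)
            if (m % d == 0)
                return {d, m / d};
        return split;  // no suitable split found
    }

    int main() {
        assert(split_m(20, 32, 40) == std::make_pair(std::size_t(2), std::size_t(16)));
        assert(split_m(15, 45, 40) == std::make_pair(std::size_t(5), std::size_t(9)));
        assert(split_m(5, 16384, 40) == std::make_pair(std::size_t(8), std::size_t(2048)));
        return 0;
    }

The asserts reproduce the {batch_m, kernel_m} pairs expected by the restored test data.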
--- .../include/snippets/pass/tokenization.hpp | 10 ++-------- .../snippets/src/pass/mha_tokenization.cpp | 16 +++++---------- .../transformation_pipeline.cpp | 15 +------------- .../custom/subgraph_tests/src/x64/mha.cpp | 20 ++++--------------- .../snippets/mha_quantized.cpp | 2 +- 5 files changed, 13 insertions(+), 50 deletions(-) diff --git a/src/common/snippets/include/snippets/pass/tokenization.hpp b/src/common/snippets/include/snippets/pass/tokenization.hpp index ee9c25f05104f7..24efcceec71a24 100644 --- a/src/common/snippets/include/snippets/pass/tokenization.hpp +++ b/src/common/snippets/include/snippets/pass/tokenization.hpp @@ -66,10 +66,10 @@ class SnippetsTokenization : public ov::pass::ModelPass { */ struct Config { Config(size_t concurrency, size_t data_ptr_gpr_count, bool split_m_dimension, bool enable_transpose_on_output, - bool dyn_mha_token, std::set mha_transpose_ranks, ov::pass::param_callback mha_tokenize_mm_b_input_callback = nullptr) + bool dyn_mha_token, std::set mha_transpose_ranks) : m_concurrency(concurrency), m_data_ptr_gpr_count(data_ptr_gpr_count), m_split_m_dimension(split_m_dimension), m_mha_token_enable_transpose_on_output(enable_transpose_on_output), m_is_dynamic_mha_token_enabled(dyn_mha_token), - m_mha_supported_transpose_ranks(std::move(mha_transpose_ranks)), m_mha_tokenize_mm_b_input_callback(std::move(mha_tokenize_mm_b_input_callback)) { + m_mha_supported_transpose_ranks(std::move(mha_transpose_ranks)) { OPENVINO_ASSERT(concurrency > 0, "Concurrency should be greater than 0"); OPENVINO_ASSERT(data_ptr_gpr_count > 0, "data_ptr_gpr_count should be greater than 0"); } @@ -102,10 +102,6 @@ class SnippetsTokenization : public ov::pass::ModelPass { return m_mha_supported_transpose_ranks; } - bool mha_tokenize_mm_b_input_callback(const std::shared_ptr& node) const { - return m_mha_tokenize_mm_b_input_callback ? m_mha_tokenize_mm_b_input_callback(node) : false; - } - private: size_t m_concurrency = 0; // The number of gpr that can be used as data pointers for data nodes (Parameter (and non-Scalar Constants), @@ -125,8 +121,6 @@ class SnippetsTokenization : public ov::pass::ModelPass { // Note that in general Snippets support Transpose of any ranks. // But at the moment Transpose is used only in MHA pattern where 3D and 4D tensors are supported. 
std::set m_mha_supported_transpose_ranks = { 3, 4 }; - - ov::pass::param_callback m_mha_tokenize_mm_b_input_callback = nullptr; }; OPENVINO_RTTI("SnippetsTokenization", "0"); diff --git a/src/common/snippets/src/pass/mha_tokenization.cpp b/src/common/snippets/src/pass/mha_tokenization.cpp index 96babcd54e5161..beb465ab3a3fbe 100644 --- a/src/common/snippets/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/src/pass/mha_tokenization.cpp @@ -355,9 +355,7 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken // We can allow to call this pass only if ops have scalar shapes to avoid shape mismatching const auto is_transposed_b_0 = matmul0->get_transpose_b(); bool has_matmul0_has_ops_on_input = false; - - const bool support_mm0_b_input_tokenization = !config.mha_tokenize_mm_b_input_callback(matmul0); - while (support_mm0_b_input_tokenization && is_supported_intermediate_op(parent)) { + while (is_supported_intermediate_op(parent)) { // All supported ops have only one output port if (parent->get_output_target_inputs(0).size() != 1) break; @@ -406,16 +404,12 @@ ov::snippets::pass::TokenizeMHASnippets::TokenizeMHASnippets(const SnippetsToken } }; - if (support_mm0_b_input_tokenization) { - const auto transpose1 = ov::as_type_ptr(parent); - tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin()); - } + const auto transpose1 = ov::as_type_ptr(parent); const auto transpose0 = ov::as_type_ptr(matmul0->get_input_node_shared_ptr(0)); + const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); + tokenize_transpose(transpose1, is_transposed_b_0, get_decomposed_transpose_order(pattern_rank), ordered_ops.begin()); tokenize_transpose(transpose0, matmul0->get_transpose_a(), get_fusion_transpose_order(pattern_rank), ordered_ops.begin()); - if (!config.mha_tokenize_mm_b_input_callback(matmul1)) { - const auto transpose2 = ov::as_type_ptr(matmul1->get_input_node_shared_ptr(1)); - tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end()); - } + tokenize_transpose(transpose2, matmul1->get_transpose_b(), get_fusion_transpose_order(pattern_rank), ordered_ops.end()); ordered_ops.push_back(matmul1); bool are_ops_after_matmul1 = false; diff --git a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp index fc39b94fe1088d..9dd1da2d471e5a 100644 --- a/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp +++ b/src/plugins/intel_cpu/src/transformations/transformation_pipeline.cpp @@ -953,22 +953,9 @@ void Transformations::MainSnippets(void) { bool split_m_dimension = !ignoreCallback; // [122706] Some 3D MHA Patterns have perf regressions when Transpose op is tokenized std::set mha_supported_transpose_ranks = { 4 }; - - // If preliminary repacking is needed, it is executed outside the snippets kernel for performance reasons, - // so tokenization of ops sequences on matmul's B input is disabled - // Ticket 157743: This logic should be placed in CPU specific SubgraphPass. 
- auto mha_tokenize_mm_b_input_callback = [this](const std::shared_ptr& node) { - const auto& input_type_0 = node->get_input_element_type(0); - const auto& input_type_1 = node->get_input_element_type(1); - - const bool u8i8_repacking_wo_compensations = input_type_0 == ov::element::u8 && input_type_1 == ov::element::i8; - const bool bf16_repacking = input_type_0 == ov::element::f32 && input_type_1 == ov::element::f32 && - config.inferencePrecision == ov::element::bf16; - return u8i8_repacking_wo_compensations || bf16_repacking; - }; snippets::pass::SnippetsTokenization::Config tokenization_config(concurrency, data_ptr_gpr_count, split_m_dimension, mha_token_enable_transpose_on_output, is_dynamic_mha_token_enabled, - mha_supported_transpose_ranks, mha_tokenize_mm_b_input_callback); + mha_supported_transpose_ranks); ov::pass::Manager snippetsManager("CPU:Snippets"); snippetsManager.set_per_pass_validation(false); diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp index 62b7a3390879e1..8517612a348f68 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mha.cpp @@ -3,9 +3,9 @@ // #include "common_test_utils/common_utils.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" #include "common_test_utils/node_builders/constant.hpp" #include "common_test_utils/node_builders/fake_quantize.hpp" -#include "common_test_utils/ov_tensor_utils.hpp" #include "internal_properties.hpp" #include "shared_test_classes/base/ov_subgraph.hpp" #include "utils/cpu_test_utils.hpp" @@ -666,27 +666,15 @@ std::vector> matMulIn0PrecisionsQuant = { {ElementType::i8, ElementType::u8}, }; -INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0_i8i8, +INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0, MHAQuantTest, ::testing::Combine(::testing::ValuesIn(static_shapes_to_test_representation(inputShapesQuant)), ::testing::ValuesIn(inputPrecisionsQuant), - ::testing::Values(std::vector{ElementType::i8, ElementType::i8}), - ::testing::Values(0), - ::testing::Values(ExpectedNodes{ - {"Subgraph", 5}, // FQs on inputs x 3 + MHA + Deq Mul - {"Transpose", 1}}), // Transpose between MHA and Deq Mul + Extracted transpose on B input of 2nd MM - ::testing::Values(ov::test::utils::DEVICE_CPU)), - MHAQuantTest::getTestCaseName); - -INSTANTIATE_TEST_SUITE_P(smoke_MHAQuant_Pattern0_i8u8, - MHAQuantTest, - ::testing::Combine(::testing::ValuesIn(static_shapes_to_test_representation(inputShapesQuant)), - ::testing::ValuesIn(inputPrecisionsQuant), - ::testing::Values(std::vector{ElementType::i8, ElementType::u8}), + ::testing::ValuesIn(matMulIn0PrecisionsQuant), ::testing::Values(0), ::testing::Values(ExpectedNodes{ {"Subgraph", 5}, // FQs on inputs x 3 + MHA + Deq Mul - {"Transpose", 2}}), // Transpose between MHA and Deq Mul + Extracted transpose on B input of 2nd MM + {"Transpose", 1}}), // Transpose between MHA and Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU)), MHAQuantTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp index 0a12e0a36a3621..0c731b74565863 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_quantized.cpp 
@@ -48,7 +48,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(ov::element::f32), ::testing::Values(false), // The graph doesn't contain Multiply ::testing::Values(MHA::default_thread_count), - ::testing::Values(7), // FQx3 on inputs + MHA + Transpose on output + Transpose on Matmul's B input + Deq Mul + ::testing::Values(6), // FQx3 on inputs + MHA + Transpose on output + Deq Mul ::testing::Values(5), // FQx3 on inputs + MHA + Deq Mul ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), From 01115d35b32a9403ebdf1e7f4f2da8c6700c1046 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 21 Nov 2024 16:44:22 +0100 Subject: [PATCH 40/42] Conservatively extend SplitDimensionM::get_splited_dimensions --- .../snippets/src/pass/split_dimension_m.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/common/snippets/src/pass/split_dimension_m.cpp b/src/common/snippets/src/pass/split_dimension_m.cpp index 0f50ad27931e04..ae95a371483163 100644 --- a/src/common/snippets/src/pass/split_dimension_m.cpp +++ b/src/common/snippets/src/pass/split_dimension_m.cpp @@ -34,6 +34,8 @@ bool SplitDimensionM::is_supported_matmul(const std::shared_ptr& std::pair SplitDimensionM::get_splited_dimensions(size_t batch_dim, size_t m_dim, size_t optimal_parallelism_work_amount) { std::pair splited = { 1, m_dim }; + // Ideal case #1: M can be split on the parts one of which complements the batch dimension to the optimal parallel work amount + // In this case, each thread will execute the Snippets kernel once const size_t lower_bound = optimal_parallelism_work_amount / batch_dim; if (lower_bound * batch_dim == optimal_parallelism_work_amount && m_dim % lower_bound == 0) { splited.first = lower_bound; @@ -42,6 +44,19 @@ std::pair SplitDimensionM::get_splited_dimensions(size_t batch_d return splited; } + // Ideal case #2: M is divisible by optimal parallel work amount, and the new_m_dim is big enough + // In this case, each thread will execute the Snippets kernel 'batch_dim' times + if (m_dim % optimal_parallelism_work_amount == 0) { + const auto new_m_dim = m_dim / optimal_parallelism_work_amount; + const size_t min_kernel_m = 64; + if (new_m_dim >= min_kernel_m) { + splited.first = optimal_parallelism_work_amount; + splited.second = new_m_dim; + OPENVINO_ASSERT(splited.first * splited.second == m_dim, "Incorrect dimension M splitting!"); + return splited; + } + } + const size_t upper_bound = utils::div_up(2 * optimal_parallelism_work_amount, batch_dim); for (size_t divisor_0 = upper_bound - 1; divisor_0 > 1; divisor_0--) { size_t divisor_1 = m_dim / divisor_0; From d21a099ac390fd7dbab6533039b54601c4103672 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 21 Nov 2024 16:51:44 +0100 Subject: [PATCH 41/42] Revert changes in BF16 tests --- .../tests/functional/shared_tests_instances/snippets/mha.cpp | 2 +- .../shared_tests_instances/snippets/mha_with_dyn_mul.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp index 45bb055d086910..63f5176684ccc1 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha.cpp @@ -131,7 +131,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_MHAEnforceBF16, ::testing::Values(ov::element::bf16), ::testing::ValuesIn({false}), 
::testing::Values(MHA::default_thread_count), - ::testing::Values(9), + ::testing::Values(7), ::testing::Values(6), ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::cpu_bf16_plugin_config)), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp index ccd23dd6833f98..7876d737af2281 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/mha_with_dyn_mul.cpp @@ -56,7 +56,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::ValuesIn(precision_f32(5)), ::testing::Values(ov::element::bf16), ::testing::Values(MHA::default_thread_count), - ::testing::Values(10), // MHA + 1 Transpose on output + 6 Converts around + 2 Transposes on Matmul's B inputs + ::testing::Values(8), // MHA + 1 Transpose on output + 6 Converts around ::testing::Values(7), // MHA + 6 Converts around ::testing::Values(ov::test::utils::DEVICE_CPU), ::testing::Values(CPUTestUtils::empty_plugin_config)), From 6df0b31ebcc0436cc7628d71676572147d7b04c9 Mon Sep 17 00:00:00 2001 From: Vladislav Golubev Date: Thu, 21 Nov 2024 17:10:45 +0100 Subject: [PATCH 42/42] Finalize snippets tests --- src/common/snippets/tests/src/pass/mha_tokenization.cpp | 2 +- src/common/snippets/tests/src/utils/split_dim_m.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/common/snippets/tests/src/pass/mha_tokenization.cpp b/src/common/snippets/tests/src/pass/mha_tokenization.cpp index c5932ed690d670..382257f935cc49 100644 --- a/src/common/snippets/tests/src/pass/mha_tokenization.cpp +++ b/src/common/snippets/tests/src/pass/mha_tokenization.cpp @@ -204,7 +204,7 @@ TEST_F(TokenizeMHASnippetsTests, smoke_Snippets_MHAWOTranspose_SplitM) { const auto& f = MHAWOTransposeSplitMFunction(std::vector{{10, 9216, 128}, {10, 128, 9216}, {10, 9216, 128}}, std::vector({ov::element::f32, ov::element::f32, ov::element::f32}), - std::vector{{10, 3, 3072, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); + std::vector{{10, 18, 512, 128}, {10, 1, 128, 9216}, {10, 1, 9216, 128}, {10, 9216, 128}}); model = f.getOriginal(); model_ref = f.getReference(); config.set_concurrency(18); diff --git a/src/common/snippets/tests/src/utils/split_dim_m.cpp b/src/common/snippets/tests/src/utils/split_dim_m.cpp index 69a04da6f1263f..9e801fceae02e9 100644 --- a/src/common/snippets/tests/src/utils/split_dim_m.cpp +++ b/src/common/snippets/tests/src/utils/split_dim_m.cpp @@ -58,6 +58,7 @@ const std::vector split_dimension_cases = { {InputData{15, 45, 40}, ReferenceData{true, 5, 9}}, {InputData{25, 50, 40}, ReferenceData{true, 2, 25}}, {InputData{5, 16384, 40}, ReferenceData{true, 8, 2048}}, + {InputData{5, 16384, 32}, ReferenceData{true, 32, 512}}, }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_SplitDimensionM,
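Worked check of the new "Ideal case #2" branch (added in [PATCH 40/42] above) against the test row it enables, InputData{5, 16384, 32} -> ReferenceData{true, 32, 512}: case #1 computes lower_bound = 32 / 5 = 6, and 6 * 5 = 30 != 32, so it is rejected; case #2 then applies because 16384 % 32 == 0 and 16384 / 32 = 512 >= min_kernel_m = 64, so the split {32, 512} is returned before the generic divisor search runs. As the comment in the added code states, each of the 32 threads then executes the Snippets kernel batch_dim = 5 times.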