diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 92b321e88e8b68..f21a6951fedd62 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -84,7 +84,7 @@ class Schedule { * @param f can this kernel be linearided to 1D range * @param p pointer to generated code */ - Schedule(const Shape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {} + Schedule(const ov::PartialShape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {} /** * @brief Returns callable instanse of code pointer */ @@ -92,7 +92,7 @@ class Schedule { return reinterpret_cast(const_cast(ptr)); } - Shape work_size {}; + ov::PartialShape work_size {}; bool is_flat {false}; code ptr {nullptr}; }; @@ -123,7 +123,7 @@ class Generator { * @brief gets target machine * @return pointer to constant target machine */ - std::shared_ptr get_target_machine() const { return target; } + std::shared_ptr get_target_machine() const; protected: std::shared_ptr target; diff --git a/src/common/snippets/include/snippets/op/broadcastload.hpp b/src/common/snippets/include/snippets/op/broadcastload.hpp index 0d90fb15a84b97..851c0ca8c3ea7d 100644 --- a/src/common/snippets/include/snippets/op/broadcastload.hpp +++ b/src/common/snippets/include/snippets/op/broadcastload.hpp @@ -21,7 +21,7 @@ class BroadcastLoad : public BroadcastMove { public: OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ngraph::snippets::op::BroadcastMove); - BroadcastLoad(const Output& x, Shape output_shape); + BroadcastLoad(const Output& x, ov::PartialShape output_shape); BroadcastLoad() = default; bool visit_attributes(AttributeVisitor& visitor) override; @@ -29,17 +29,6 @@ class BroadcastLoad : public BroadcastMove { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; - - void set_broadcast_info(const Shape& bct) { - broadcast_info = 
bct; - } - - bool is_broadcast(size_t idx) { - return broadcast_info[idx] == 1; - } - -private: - Shape broadcast_info; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/broadcastmove.hpp b/src/common/snippets/include/snippets/op/broadcastmove.hpp index 0ab279f1ac814e..8514d61898b8df 100644 --- a/src/common/snippets/include/snippets/op/broadcastmove.hpp +++ b/src/common/snippets/include/snippets/op/broadcastmove.hpp @@ -19,7 +19,7 @@ class BroadcastMove : public ngraph::op::Op { public: OPENVINO_OP("BroadcastMove", "SnippetsOpset"); - BroadcastMove(const Output& x, Shape output_shape); + BroadcastMove(const Output& x, ov::PartialShape output_shape); BroadcastMove() = default; bool visit_attributes(AttributeVisitor& visitor) override; @@ -28,12 +28,9 @@ class BroadcastMove : public ngraph::op::Op { void validate_and_infer_types() override; - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END protected: - Shape output_shape; + ov::PartialShape output_shape; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index 7f53240ae21946..a263db7581987f 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "snippets/op/memory_access.hpp" namespace ngraph { namespace snippets { @@ -17,29 +18,14 @@ namespace op { * Default value is "1" - to load one element * @ingroup snippets */ -class Load : public ngraph::op::Op { +class Load : public MemoryAccess { public: OPENVINO_OP("Load", "SnippetsOpset"); Load(const Output& x, const size_t count = 1lu); Load() = default; - size_t get_count() const { return m_count; } - - void set_count(const size_t count) { m_count = count; } - - bool visit_attributes(AttributeVisitor& visitor) override; - 
std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - void validate_and_infer_types() override; - - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END - -protected: - size_t m_count = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/loop.hpp b/src/common/snippets/include/snippets/op/loop.hpp new file mode 100644 index 00000000000000..519cc53ddd3eaf --- /dev/null +++ b/src/common/snippets/include/snippets/op/loop.hpp @@ -0,0 +1,92 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "snippets/emitter.hpp" +#include "ngraph/op/parameter.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface LoopBase + * @brief Inserted during scheduling generation and represents Loop in affine notation + * @ingroup snippets + */ +class LoopBase : public ngraph::op::Op { +public: + OPENVINO_OP("LoopBase", "SnippetsOpset"); + LoopBase(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment); + LoopBase() = delete; + bool visit_attributes(AttributeVisitor& visitor) override; + size_t get_work_amount() const; + size_t get_increment() const; + size_t get_dimension() const; + bool get_evaluate_once() const; + +protected: + size_t dimension; + size_t work_amount; + size_t increment; + bool evaluate_once; // true if the Loop is executed only once, used to skip setting and testing the loop counter +}; +class LoopEnd; +class LoopBegin : public LoopBase { + friend LoopEnd; +public: + OPENVINO_OP("LoopBegin", "SnippetsOpset"); + /// \brief Construct an Loop + /// \param region The vector of pairs: emitters and the corresponding registers + /// \param increment Loop size - count of elements to load and store. 
+ /// Vector Loop should have size of vector register and Scalar Loop should have 1 + /// \param num_inputs Count of inputs + /// \param num_outputs Count of outputs + /// \param io_dims Vector of last dimensions of inputs and outputs + /// \param io_data_sizes Vector of data type sizes of inputs and outputs + explicit LoopBegin(const std::vector>& args); + LoopBegin() = delete; + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + std::shared_ptr get_loop_end(); + // begin_address and input_regs are needed to communicate information between LoopBegin and LoopEnd emitters + const uint8_t* begin_address; + std::vector input_regs; +private: + void validate_and_infer_types_except_LoopEnd(); + LoopBegin(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment); +}; + +class LoopEnd : public LoopBase { +public: + OPENVINO_OP("LoopEnd", "SnippetsOpset"); + LoopEnd(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment, std::vector finalization_offsets); + LoopEnd() = delete; + std::shared_ptr get_loop_begin(); + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + const std::vector& get_finalization_offsets() const; + const std::vector& get_apply_increment() const; + void set_finalization_offsets(std::vector offsets); + void set_apply_increment(std::vector apply_increment); + void set_work_amount(size_t new_work_amount); + void set_increment(size_t new_increment); + void set_evaluate_once(bool once); + // Used to propagate information about Loop structure, needed to simplify some optimizations. 
For example, + // to skip pointer increments when outer Loop is empty, and work_amount == vector_size (one inner vector Loop) + // true by default, the optimizations enabled if it's false; + bool has_outer_loop; + +private: + std::vector apply_increment; + std::vector finalization_offsets; + size_t loop_io_size; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/loop_helpers.hpp b/src/common/snippets/include/snippets/op/loop_helpers.hpp new file mode 100644 index 00000000000000..57a14e5f036cc9 --- /dev/null +++ b/src/common/snippets/include/snippets/op/loop_helpers.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "ngraph/op/parameter.hpp" +#include "loop.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/* ==== LoopBegin === */ +std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs); + +template +std::shared_ptr insertLoopBegin(const T& afterTheseNodes) { + static_assert(std::is_same() || std::is_same(), + "Unsupported template parameter for insertLoopBegin. 
Only ParameterVector or NodeVector is allowed"); + OutputVector originalOutputs; + std::vector>> childInputs; + for (const auto &n : afterTheseNodes) { + const auto& nodeOutputs = n->outputs(); + // Ignore the LoopBegin->LoopEnd edge to make it easier to construct enclosed Loops + std::move(nodeOutputs.begin(), nodeOutputs.end() - 1 * ov::is_type(n), std::back_inserter(originalOutputs)); + } + + return insertLoopBeginAfterOutputs(originalOutputs); +} + +template<> +inline std::shared_ptr insertLoopBegin(const OutputVector& afterTheseNodes) { + return insertLoopBeginAfterOutputs(afterTheseNodes); +} +/* ============== */ + +/* ==== LoopEnd === */ +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& tileBegin, + size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment = {}, + std::vector finalization_offsets = {}); + +template +std::shared_ptr insertLoopEnd(const T& beforeTheseNodes, Args ...args) { + static_assert(std::is_same() || std::is_same(), + "Unsupported template parameter for insertLoopBegin. 
Only ParameterVector or NodeVector is allowed"); + std::vector> originalInputs; + for (const auto &n : beforeTheseNodes) { + const auto& nodeInputs = n->inputs(); + // Ignore the LoopBegin->LoopEnd edge to facilitate enclosed Loops construction + std::move(nodeInputs.begin(), nodeInputs.end() - 1 * ov::is_type(n), std::back_inserter(originalInputs)); + } + return insertLoopEndBeforeInputs(originalInputs, args...); +} + +template +std::shared_ptr insertLoopEnd(const std::vector>& beforeTheseNodes, Args ...args) { + return insertLoopEndBeforeInputs(beforeTheseNodes, args...); +} +/* ============== */ + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/memory_access.hpp b/src/common/snippets/include/snippets/op/memory_access.hpp new file mode 100644 index 00000000000000..22aca3f358be4c --- /dev/null +++ b/src/common/snippets/include/snippets/op/memory_access.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface MemoryAccess + * @brief This is an ubre + * where number of elements to store is determined by "count" + * Default value is "1" - to store one element + * @ingroup snippets + */ + +class MemoryAccess : public ngraph::op::Op { +public: + OPENVINO_OP("MemoryAccess", "SnippetsOpset"); + + size_t get_count() const; + void set_count(size_t count); + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + +protected: + explicit MemoryAccess(const Output& x, size_t count = 1lu); + MemoryAccess() = default; + size_t m_count = 0lu; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/scalar.hpp b/src/common/snippets/include/snippets/op/scalar.hpp index a8de072be50f10..1916f554a32c67 100644 
--- a/src/common/snippets/include/snippets/op/scalar.hpp +++ b/src/common/snippets/include/snippets/op/scalar.hpp @@ -34,6 +34,7 @@ class Scalar : public ov::op::v0::Constant { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; + bool visit_attributes(AttributeVisitor& visitor) override; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index 0ff5cc3ec8e063..b83a4fdcec2b18 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ b/src/common/snippets/include/snippets/op/store.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "snippets/op/memory_access.hpp" namespace ngraph { namespace snippets { @@ -17,29 +18,14 @@ namespace op { * Default value is "1" - to store one element * @ingroup snippets */ -class Store : public ngraph::op::Op { +class Store : public MemoryAccess { public: OPENVINO_OP("Store", "SnippetsOpset"); Store(const Output& x, const size_t count = 1lu); Store() = default; - size_t get_count() const { return m_count; } - - void set_count(const size_t count) { m_count = count; } - - bool visit_attributes(AttributeVisitor& visitor) override; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - void validate_and_infer_types() override; - - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END - -protected: - size_t m_count = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index 43c6376ad607c1..dfcde2bd4fd2c6 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -25,6 +25,7 @@ namespace op { class Subgraph : public ngraph::op::Op { public: 
OPENVINO_OP("Subgraph", "SnippetsOpset"); + enum {DYNAMIC_DIMENSION = 0xffffffffffffffff}; // < 1, 42, 17, 15, 16> < 0, 1, 2, 3, 1> // should be: @@ -67,7 +68,7 @@ class Subgraph : public ngraph::op::Op { // // D = < 1, 3, 17, 15, 32> < 0, 1, 2, 3, 4> // E = < 1, 3, 17, 1, 32> < 0, 1, 2, 3, 4> - using BlockedShape = std::tuple; + using BlockedShape = std::tuple; using BlockedShapeVector = std::vector; Subgraph(const OutputVector& args, std::shared_ptr body); @@ -105,7 +106,10 @@ class Subgraph : public ngraph::op::Op { snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const void* compile_params = nullptr); snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr); snippets::Schedule generate(const void* compile_params = nullptr); - Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + ov::PartialShape get_master_shape(); + std::vector reshape_body(const std::vector& input_shapes); + std::vector reshape_body(const std::vector& input_shapes); // plugin sets generator for a snippet to some specific generator. 
// it's going to be replaced with Jitters table later @@ -116,6 +120,7 @@ class Subgraph : public ngraph::op::Op { void print_statistics(bool verbose); void serialize() const; + void set_master_shape(ov::PartialShape new_shape) {master_shape = std::move(new_shape);} static auto wrap_node_as_subgraph(const std::shared_ptr& node) -> std::shared_ptr; static void fill_empty_output_names(const Output& target_output_node, const Output& replacement_output_node); @@ -146,6 +151,8 @@ class Subgraph : public ngraph::op::Op { // because TypeRelaxed::copy_with_new_inputs() isn't save-thread method bool m_has_type_relaxed_ops = false; } config; + + ov::PartialShape master_shape; }; static inline std::ostream& operator<<(std::ostream& os, const op::Subgraph::BlockedShape& blocked_shape) { diff --git a/src/common/snippets/include/snippets/op/tile.hpp b/src/common/snippets/include/snippets/op/tile.hpp deleted file mode 100644 index ac1d6ef4d1a2b9..00000000000000 --- a/src/common/snippets/include/snippets/op/tile.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ngraph/op/op.hpp" -#include "snippets/emitter.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface Tile - * @brief Generated by Canonicalization and represents Loop in affine notation - * @ingroup snippets - */ -class Tile : public ngraph::op::Op { -public: - OPENVINO_OP("Tile", "SnippetsOpset"); - - /// \brief Construct an Tile - /// \param region The vector of pairs: emitters and the corresponding registers - /// \param increment Tile size - count of elements to load and store. 
- /// Vector Tile should have size of vector register and Scalar Tile should have 1 - /// \param num_inputs Count of inputs - /// \param num_outputs Count of outputs - /// \param io_dims Vector of last dimensions of inputs and outputs - /// \param io_data_sizes Vector of data type sizes of inputs and outputs - Tile(const std::vector& region, size_t increment, size_t num_inputs, size_t num_outputs, - const std::vector& io_dims, const std::vector& io_data_sizes); - Tile() = default; - std::vector region; - size_t increment = 0; - size_t num_inputs = 0; - size_t num_outputs = 0; - std::vector io_dims {}; - std::vector io_data_size {}; - - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(region, increment, num_inputs, num_outputs, io_dims, io_data_size); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/tile_scheduler.hpp b/src/common/snippets/include/snippets/op/tile_scheduler.hpp deleted file mode 100644 index 9d6010f77978b0..00000000000000 --- a/src/common/snippets/include/snippets/op/tile_scheduler.hpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ngraph/op/op.hpp" -#include "snippets/emitter.hpp" -#include "tile.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface TileScheduler - * @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations - * before the Tiles could be executed: calculates offsets, sets proper work amounts, decrement pointers if the same data - * have to be read several times (broadcasting). 
- * @ingroup snippets - */ -class TileScheduler : public ngraph::op::Op { -public: - OPENVINO_OP("TileScheduler", "SnippetsOpset"); - - TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region); - TileScheduler() = default; - AllocatedEmitter vector_region; - AllocatedEmitter scalar_region; - // todo: this clone_with_new_inputs is irrelevant - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(vector_region, scalar_region); - } - const void *compile_params; -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_loops.hpp b/src/common/snippets/include/snippets/pass/insert_loops.hpp new file mode 100644 index 00000000000000..874fc688e404a5 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_loops.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertLoops + * @brief Insert explicit Loop operations into the body to process multiple data entities during one kernel execution + * @ingroup snippets + */ +class InsertLoops: public ngraph::pass::FunctionPass { +public: + OPENVINO_RTTI("InsertLoops", "0"); + InsertLoops(ov::PartialShape master_shape, size_t vector_size); + bool run_on_model(const std::shared_ptr& m) override; + +private: + ov::PartialShape master_shape; + size_t vector_size; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index f1c0e9056d66eb..1137de1db0c76c 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -17,8 +17,7 @@ #include "op/scalar.hpp" #include 
"op/powerstatic.hpp" #include "op/store.hpp" -#include "op/tile.hpp" -#include "op/tile_scheduler.hpp" +#include "op/loop.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 0e85fe72861a21..d34b93392a09c8 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -6,34 +6,35 @@ #include "snippets/pass/assign_registers.hpp" #include "snippets/pass/vector_to_scalar.hpp" #include "snippets/pass/insert_load_store.hpp" -#include "snippets/op/tile.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/op/subgraph.hpp" #include "snippets/op/kernel.hpp" #include #include +#include -auto ngraph::snippets::getRegisters(std::shared_ptr& n) -> ngraph::snippets::RegInfo { +namespace ngraph { +namespace snippets { + +auto getRegisters(const std::shared_ptr &n) -> RegInfo { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::getRegisters") - auto rt = n->get_rt_info(); // ToDo: change to reg_t std::vector rin, rout; - auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) { - for (auto reg : it_rt->second.as>()) { - rout.push_back(reg); - } + for (const auto& output : n->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) + rout.push_back(it_rt->second.as()); } for (const auto& input : n->inputs()) { - auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info(); + auto rt = input.get_source_output().get_tensor_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) { - for (auto& reg : it_rt->second.as>()) { - rin.push_back(reg); - } - } + if (it_rt != rt.end()) + rin.push_back(it_rt->second.as()); } return std::make_pair(rin, rout); } @@ -42,70 +43,143 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptris_supported()) - throw ngraph_error("unsupported architecture for code 
genration"); + throw ngraph_error("unsupported architecture for code generation"); auto params = m->get_parameters(); auto results = m->get_results(); auto in = params.size(); auto out = results.size(); + std::vector io_last_dims(in + out); std::vector io_data_sizes(in + out); std::transform(params.begin(), params.end(), io_last_dims.begin(), - [](const std::shared_ptr& n){return n->get_output_shape(0).back();}); + [](const std::shared_ptr& n){ + auto last_dim = n->get_output_partial_shape(0).rbegin(); + return last_dim->is_dynamic() ? op::Subgraph::DYNAMIC_DIMENSION + : last_dim->get_length(); + }); std::transform(results.begin(), results.end(), io_last_dims.begin() + in, - [](const std::shared_ptr& n){return n->get_input_shape(0).back();}); + [](const std::shared_ptr &n) { + auto last_dim = n->get_input_partial_shape(0).rbegin(); + return last_dim->is_dynamic() ? op::Subgraph::DYNAMIC_DIMENSION + : last_dim->get_length(); + }); std::transform(params.begin(), params.end(), io_data_sizes.begin(), [](const std::shared_ptr& n){return n->get_element_type().size();}); std::transform(results.begin(), results.end(), io_data_sizes.begin() + in, [](const std::shared_ptr& n){return n->get_element_type().size();}); OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile") - // vector tile + // vector loop std::vector lowered; - for (auto n : m->get_ordered_ops()) { - lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); - } - OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile") - - // scalar tile - auto m_scalar = ov::clone_model(*m.get()); - ngraph::pass::Manager mng; - mng.register_pass(); - mng.register_pass(); - mng.run_passes(m_scalar); - OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get") - std::vector scalar_lowered; - for (auto n : m_scalar->get_ordered_ops()) { - scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), 
ngraph::snippets::getRegisters(n))); + auto lower_ops = [&lowered, this](const NodeVector& ops){ + std::transform(ops.begin(), ops.end(), std::back_inserter(lowered), + [this](const std::shared_ptr& n){ + return std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)); + }); + }; + // *1* solo vector/tail loop + empty outer loop + // => skip increments (both counter & ptr) : set evaluate_once flag + // *2* solo vector/tail loop + non-empty outer loop + // => skip counter increments but perform ptr increments : set evaluate_once, + // and perform pointer increments through finalization offsets + // *3* vector loop(s) + one tail loop + // => vector as usual, tail depends on outer loop, see *1* and *2* + auto optimize_single_evaluation = [](const std::shared_ptr& loop, bool force_ptr_increment = false) { + if (loop->get_work_amount() < 2 * loop->get_increment()) { + loop->set_evaluate_once(true); + if (force_ptr_increment || loop->has_outer_loop) { + const auto increment = loop->get_increment(); + std::vector new_finalization_offsets(loop->get_finalization_offsets()); + const auto& apply_increments = loop->get_apply_increment(); + for (auto i = 0; i < new_finalization_offsets.size(); i++) { + new_finalization_offsets[i] += increment * apply_increments[i]; + } + loop->set_finalization_offsets(new_finalization_offsets); + } + return true; + } else { + return false; + } + }; + const auto& ops = m->get_ordered_ops(); + for (auto op = ops.begin(); op < ops.end(); op++) { + const auto& loop_begin = ov::as_type_ptr(*op); + // ignore outer loops and possible manual tail loops + if (loop_begin && loop_begin->get_increment() != 1) { + NodeVector vector_loop, tail_loop; + std::shared_ptr vector_loop_end, tail_loop_end; + vector_loop_end = loop_begin->get_loop_end(); + tail_loop_end = nullptr; + while (*op != vector_loop_end) + vector_loop.push_back(*op++); + vector_loop.push_back(*op); + const auto work_amount = vector_loop_end->get_work_amount(); + 
const auto increment = vector_loop_end->get_increment(); + const auto tail_size = work_amount % increment; + const auto need_tail = tail_size != 0; + const auto need_vector_loop = work_amount >= increment; + // Note, that finalization_offsets could be modified inside optimize_single_evaluation, + // so need to save them here to cover (evaluate_once vector with non-zero finalization_offsets + tail) + std::vector tail_finalization_offsets = need_tail ? vector_loop_end->get_finalization_offsets() : std::vector {}; + // vector loops are required => Just copy the body, original loop is already a vector one + if (need_vector_loop) { + // Note that finalization offsets should be applied after the last iteration. + // So if there is a tail, then we should apply offsets after it, but not now. + if (need_tail) + vector_loop_end->set_finalization_offsets(std::vector(tail_finalization_offsets.size(), 0)); + // force ptr increments if there is tail + optimize_single_evaluation(vector_loop_end, need_tail); + lower_ops(vector_loop); + } + OV_ITT_TASK_NEXT(GENERATE, "::TailLoop") + // tail is required => transform the body into a tail representation + // tail loop is fake loop because for tail we should calculate only + // finalization offsets which are supported by LoopEnd. 
+ if (need_tail) { + NodeMap vector_to_tail_node_map; + tail_loop = ngraph::clone_nodes(vector_loop, vector_to_tail_node_map); + std::transform(tail_loop.begin(), tail_loop.end(), tail_loop.begin(), + [tail_size](const std::shared_ptr& n){ + const auto& memory_access = std::dynamic_pointer_cast(n); + if (memory_access && memory_access->get_count() != 1) { + memory_access->set_count(tail_size); + } + return n; + }); + tail_loop_end = ov::as_type_ptr(*tail_loop.rbegin()); + tail_loop_end->set_finalization_offsets(tail_finalization_offsets); + tail_loop_end->set_increment(tail_size); + tail_loop_end->set_work_amount(tail_size); + tail_loop_end->has_outer_loop = vector_loop_end->has_outer_loop; + // tail loop is always executed once + optimize_single_evaluation(tail_loop_end); + lower_ops(tail_loop); + } + } else { + lower_ops({*op}); + } } - OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D"); - // wrapping into tiles1D - //todo: in, out, and io_last_dims should derive naturally from the graph representation - const auto& vector_tile = std::make_shared(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes); - const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile), - std::make_pair(std::vector{}, std::vector{})); - const auto& scalar_tile = std::make_shared(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes); - const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile), - std::make_pair(std::vector{}, std::vector{})); - - OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D") - // wrapping into tiles2D - auto tile_scheduler = std::make_shared(vector_region, scalar_region); - tile_scheduler->compile_params = compile_params; - const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler), - std::make_pair(std::vector({in, out, target->get_lanes()}), std::vector{})); 
OV_ITT_TASK_NEXT(GENERATE, "::EmitCode") // emission - auto tiles2DKernel = std::make_shared(std::vector {tile_scheduler_region}); - tiles2DKernel->compile_params = compile_params; - std::shared_ptr kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel); + auto loops2DKernel = std::make_shared(std::vector{lowered}); + loops2DKernel->compile_params = compile_params; + std::shared_ptr kernel = target->get(op::Kernel::get_type_info_static())(loops2DKernel); + kernel->emit_code({in, out}, {}); + OV_ITT_TASK_NEXT(GENERATE, "::EmitData") - lowered.insert(lowered.end(), scalar_lowered.begin(), scalar_lowered.end()); for (auto& op : lowered) { op.first->emit_data(); } OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet") return target->get_snippet(); } + +std::shared_ptr Generator::get_target_machine() const { + return target; +} + +}// namespace snippets +}// namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/broadcastload.cpp b/src/common/snippets/src/op/broadcastload.cpp index 893cae32831c51..04ba89f48775e4 100644 --- a/src/common/snippets/src/op/broadcastload.cpp +++ b/src/common/snippets/src/op/broadcastload.cpp @@ -11,8 +11,7 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, Shape shape) -: BroadcastMove(x, shape), broadcast_info(x.get_shape().size(), 0) { +snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, ov::PartialShape shape) : BroadcastMove(x, std::move(shape)) { constructor_validate_and_infer_types(); } @@ -23,9 +22,7 @@ bool snippets::op::BroadcastLoad::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::BroadcastLoad::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastLoad); check_new_args_count(this, new_args); - auto other = std::make_shared(new_args.at(0), output_shape); - other->set_broadcast_info(this->broadcast_info); - return other; + return 
std::make_shared(new_args.at(0), output_shape); } void snippets::op::BroadcastLoad::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/broadcastmove.cpp b/src/common/snippets/src/op/broadcastmove.cpp index 089cd8f2abd70b..e8c733a44d6345 100644 --- a/src/common/snippets/src/op/broadcastmove.cpp +++ b/src/common/snippets/src/op/broadcastmove.cpp @@ -12,7 +12,7 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastMove::BroadcastMove(const Output& x, Shape shape) : Op({x}), output_shape(shape) { +snippets::op::BroadcastMove::BroadcastMove(const Output& x, ov::PartialShape shape) : Op({x}), output_shape(std::move(shape)) { constructor_validate_and_infer_types(); } @@ -23,44 +23,9 @@ bool snippets::op::BroadcastMove::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::BroadcastMove::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastMove); check_new_args_count(this, new_args); - auto other = std::make_shared(new_args.at(0), this->output_shape); - return other; + return std::make_shared(new_args.at(0), output_shape); } void snippets::op::BroadcastMove::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), this->output_shape); -} - -bool snippets::op::BroadcastMove::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(BroadcastMove); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - auto ishape = 
input_values[0]->get_shape(); - auto oshape = output_values[0]->get_shape(); - - NGRAPH_CHECK(ishape.size() == oshape.size(), "input and output should have the same rank"); - - AxisSet broadcast_axes; - for (size_t k = 0; k < ishape.size(); k++) { - if (!((ishape[k] == oshape[k]) - || (ishape[k] != oshape[k] && ((ishape[k] == 1) != (oshape[k] == 1) ) ))) { - throw ngraph_error("FakeBroadcast::evaluate incompatible shapes"); - } - - if (ishape[k] != oshape[k]) { - broadcast_axes.insert(k); - } - } - - runtime::reference::broadcast(input_values[0]->get_data_ptr(), - output_values[0]->get_data_ptr(), - input_values[0]->get_shape(), - output_values[0]->get_shape(), - broadcast_axes, - sizeof(float)); - return true; } \ No newline at end of file diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index 1ac4df725fe75d..f7da7c16b1b411 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -8,39 +8,20 @@ #include -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Load::Load(const Output& x, const size_t count) : Op({x}), m_count(count) { +Load::Load(const Output& x, const size_t count) : MemoryAccess({x}, count) { constructor_validate_and_infer_types(); } -bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) { - return true; -} - -std::shared_ptr snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); return std::make_shared(new_args.at(0), m_count); } -void snippets::op::Load::validate_and_infer_types() { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); -} - -bool snippets::op::Load::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(Load); - 
NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - std::copy(input_values[0]->get_data_ptr(), - input_values[0]->get_data_ptr() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(), - output_values[0]->get_data_ptr()); - - return true; -} +}// namespace op +}// namespace snippets +}// namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/loop.cpp b/src/common/snippets/src/op/loop.cpp new file mode 100644 index 00000000000000..c36b713b8f0496 --- /dev/null +++ b/src/common/snippets/src/op/loop.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/op/loop.hpp" +#include "snippets/generator.hpp" + +using namespace std; +namespace ngraph { +namespace snippets { +namespace op { + +LoopBase::LoopBase(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment) + : Op(args), dimension(dimension), work_amount(work_amount), increment(increment), evaluate_once(false) { +} + +bool LoopBase::visit_attributes(AttributeVisitor &visitor) { + visitor.on_attribute("dimension", dimension); + visitor.on_attribute("work_amount", work_amount); + visitor.on_attribute("increment", increment); + return true; +} + +size_t LoopBase::get_work_amount() const { + return work_amount; +} + +bool LoopBase::get_evaluate_once() const { + return 
evaluate_once; +} + +size_t LoopBase::get_increment() const { + return increment; +} + +size_t LoopBase::get_dimension() const { + return dimension; +} + +LoopBegin::LoopBegin(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment) + : LoopBase(args, dimension, work_amount, increment), + begin_address(nullptr), input_regs({}) { + // We can only call a reduced validate_and_infer types from the constructor, since LoopEnd might not be attached + // to the LoopBegin at this point (which is usually the case: create LoopBegin first => then attach LoopEnd to it) + validate_and_infer_types_except_LoopEnd(); +} + +LoopBegin::LoopBegin(const std::vector> &args) + : LoopBase(args, 0, 0, 0), begin_address(nullptr), input_regs({}) { + validate_and_infer_types_except_LoopEnd(); +} + +std::shared_ptr LoopBegin::clone_with_new_inputs(const OutputVector& inputs) const { + return std::shared_ptr(new LoopBegin(inputs, dimension, work_amount, increment)); +} + + +void LoopBegin::validate_and_infer_types_except_LoopEnd() { + const size_t num_inputs = get_input_size(); + set_output_size(num_inputs + 1); + // All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd + for (int i = 0; i < num_inputs; i++) + get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr()); + set_output_type(num_inputs, element::f32, ov::PartialShape{ov::Shape{}}); +} + +void LoopBegin::validate_and_infer_types() { + validate_and_infer_types_except_LoopEnd(); + const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs(); + NODE_VALIDATION_CHECK(this, last_output_inputs.size() == 1, "LoopBegin must have exactly one input attached to the last output"); + const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); + NODE_VALIDATION_CHECK(this, loop_end != nullptr, "LoopBegin must have LoopEnd connected to its last output"); + dimension = 
loop_end->get_dimension(); + work_amount = loop_end->get_work_amount(); + increment = loop_end->get_increment(); +} + +std::shared_ptr LoopBegin::get_loop_end() { + const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs(); + if (last_output_inputs.size() != 1) + throw std::invalid_argument("LoopBegin has more than one inputs attached to the last output"); + const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); + if (!loop_end) + throw std::invalid_argument("LoopBegin last output is not connected to LoopEnd"); + return loop_end; +} + +LoopEnd::LoopEnd(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment, std::vector finalization_offsets) + : LoopBase(args, dimension, work_amount, increment), apply_increment(std::move(apply_increment)), + finalization_offsets(std::move(finalization_offsets)), has_outer_loop(true) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const { + return std::make_shared(inputs, dimension, work_amount, increment, apply_increment, finalization_offsets); +} + +std::shared_ptr LoopEnd::get_loop_begin() { + const auto& loop_begin = ov::as_type_ptr(get_input_source_output(get_input_size() - 1).get_node_shared_ptr()); + if (!loop_begin) + throw std::invalid_argument("LoopEnd last input is not connected to LoopBegin"); + return loop_begin; +} + +const std::vector& LoopEnd::get_finalization_offsets() const { + return finalization_offsets; +} + +const std::vector& LoopEnd::get_apply_increment() const { + return apply_increment; +} + +void LoopEnd::set_finalization_offsets(std::vector offsets) { + if (offsets.size() != loop_io_size) + throw std::invalid_argument("LoopEnd set_finalization_offsets is called with inconsistent offsets.size()"); + finalization_offsets = std::move(offsets); +} + +void LoopEnd::set_apply_increment(std::vector 
allow_increment) { + if (allow_increment.size() != loop_io_size) + throw std::invalid_argument("LoopEnd set_apply_increment is called with inconsistent apply_increment.size()"); + apply_increment = std::move(allow_increment); +} + +void LoopEnd::set_work_amount(size_t new_work_amount) { + work_amount = new_work_amount; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->work_amount = new_work_amount; +} + +void LoopEnd::set_increment(size_t new_increment) { + increment = new_increment; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->increment = new_increment; +} + +void LoopEnd::set_evaluate_once(bool once) { + evaluate_once = once; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->evaluate_once = once; +} + +void LoopEnd::validate_and_infer_types() { + const size_t num_inputs = get_input_size(); + const auto loop_begin = ov::as_type_ptr(input(get_input_size() - 1).get_source_output().get_node_shared_ptr()); + NODE_VALIDATION_CHECK(this, loop_begin != nullptr, "LoopEnd must have LoopBegin as the last argument"); + // Note: have to -2 because the LoopBegin->LoopEnd edge is counted twice + loop_io_size = get_input_size() + loop_begin->get_output_size() - 2; + NODE_VALIDATION_CHECK(this, apply_increment.empty() || apply_increment.size() == loop_io_size, + "apply_increments must be either empty or defined per every input & output of joined Loop. Expected size: ", + loop_io_size, " got ", apply_increment.size()); + NODE_VALIDATION_CHECK(this, finalization_offsets.empty() || finalization_offsets.size() == loop_io_size, + "finalization_offsets must be either empty or defined per every input & output of joined Loop. 
Expected size: ", + loop_io_size, " got ", finalization_offsets.size()); + if (apply_increment.empty()) + apply_increment.resize(loop_io_size, true); + if (finalization_offsets.empty()) + finalization_offsets.resize(loop_io_size, 0); + set_output_size(num_inputs - 1); + const auto& ins = inputs(); + // All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd + for (int i = 0; i < num_inputs - 1; i++) + get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr()); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/loop_helpers.cpp b/src/common/snippets/src/op/loop_helpers.cpp new file mode 100644 index 00000000000000..5882e305087eac --- /dev/null +++ b/src/common/snippets/src/op/loop_helpers.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/op.hpp" +#include "snippets/op/loop_helpers.hpp" + +namespace ngraph { +namespace snippets { +namespace op { +std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs) { + std::vector>> originalChildInputs; + for (const auto& out : originalOutputs) { + originalChildInputs.push_back(out.get_target_inputs()); + } + + auto loop_begin = std::make_shared(originalOutputs); + + for (int i = 0; i < originalChildInputs.size(); i++) { + for (auto& input : originalChildInputs[i]) { + input.replace_source_output(loop_begin->output(i)); + } + } + return loop_begin; +} + +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& loopBegin, + size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment, + std::vector finalization_offsets) { + OutputVector originalParentOutputs; + for (const auto& in : originalInputs) { + originalParentOutputs.push_back(in.get_source_output()); + } + 
originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1)); + auto loop_end = std::make_shared(originalParentOutputs, dimension, work_amount, increment, + std::move(apply_increment), std::move(finalization_offsets)); + + for (int i = 0; i < originalInputs.size(); i++) { + originalInputs[i].replace_source_output(loop_end->output(i)); + } + return loop_end; +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp new file mode 100644 index 00000000000000..79f6b63a4be691 --- /dev/null +++ b/src/common/snippets/src/op/memory_access.cpp @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/memory_access.hpp" + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +MemoryAccess::MemoryAccess(const Output& x, const size_t count) : Op({x}), m_count(count) { +} + +bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("count", m_count); + return true; +} + +size_t MemoryAccess::get_count() const { + return m_count; +} + +void MemoryAccess::set_count(const size_t count) { + m_count = count; +} + +void MemoryAccess::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/scalar.cpp b/src/common/snippets/src/op/scalar.cpp index c788c341a3e02f..7a369ee1d163c2 100644 --- a/src/common/snippets/src/op/scalar.cpp +++ b/src/common/snippets/src/op/scalar.cpp @@ -19,4 +19,14 @@ void snippets::op::Scalar::validate_and_infer_types() { NODE_VALIDATION_CHECK(this, out_pshape.get_shape().empty() || ov::shape_size(out_pshape.get_shape()) == 1, "Scalar supports only one-element constants, got ", 
out_pshape.get_shape(), " shape"); +} + +bool snippets::op::Scalar::visit_attributes(AttributeVisitor& visitor) { + auto shape = get_output_shape(0); + auto type = get_output_element_type(0); + auto value = cast_vector(); + visitor.on_attribute("element_type", type); + visitor.on_attribute("shape", shape); + visitor.on_attribute("value", value); + return true; } \ No newline at end of file diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index db3204df69ab0b..69e1e1643b769b 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -8,39 +8,20 @@ #include -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Store::Store(const Output& x, const size_t count) : Op({x}), m_count(count) { +Store::Store(const Output& x, const size_t count) : MemoryAccess({x}, count) { constructor_validate_and_infer_types(); } -bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) { - return true; -} - -std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Store::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Store); check_new_args_count(this, new_args); return std::make_shared(new_args.at(0), m_count); } -void snippets::op::Store::validate_and_infer_types() { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); -} - -bool snippets::op::Store::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(Store); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), 
"output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - std::copy(input_values[0]->get_data_ptr(), - input_values[0]->get_data_ptr() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(), - output_values[0]->get_data_ptr()); - - return true; -} +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 72573f5519a089..8cdd858a90a7a1 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -14,6 +14,7 @@ #include "snippets/pass/convert_constants.hpp" #include "snippets/pass/convert_power_to_powerstatic.hpp" #include "snippets/pass/vector_to_scalar.hpp" +#include "snippets/pass/insert_loops.hpp" #include "snippets/pass/transform_convert.hpp" #include "snippets/pass/align_element_type.hpp" #include "snippets/utils.hpp" @@ -62,6 +63,36 @@ std::shared_ptr snippets::op::Subgraph::clone_with_new_inputs(const Output return make_shared(inputs, ov::clone_model(*m_body.get())); } +std::vector snippets::op::Subgraph::reshape_body(const std::vector& input_shapes) { + auto& params = m_body->get_parameters(); + OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body"); + for (size_t i = 0; i < params.size(); ++i) { + params[i]->set_partial_shape(input_shapes[i]); + } + m_body->validate_nodes_and_infer_types(); + std::vector output_shapes; + for (const auto& res : m_body->get_results()) { + output_shapes.emplace_back(res->get_input_partial_shape(0)); + } + return output_shapes; +} + +std::vector snippets::op::Subgraph::reshape_body(const std::vector& input_shapes) { + auto& params = 
m_body->get_parameters(); + OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body"); + for (size_t i = 0; i < params.size(); ++i) { + params[i]->set_partial_shape(input_shapes[i]); + } + m_body->validate_nodes_and_infer_types(); + std::vector output_shapes; + for (const auto& res : m_body->get_results()) { + auto pshape = res->get_input_partial_shape(0); + OPENVINO_ASSERT(pshape.is_static(), "Subgraph inferred dynamic output shape during reshape with static inputs"); + output_shapes.emplace_back(res->get_input_partial_shape(0).get_shape()); + } + return output_shapes; +} + void snippets::op::Subgraph::validate_and_infer_types() { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::validate_and_infer_types") @@ -169,7 +200,8 @@ void snippets::op::Subgraph::fill_empty_output_names(const Output& target_ /// * None: all inputs have the same layout /// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. 
+ /// Also there is precision aligning inside body of subgraph during canonicalization -Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { +ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, + const BlockedShapeVector& inputShapes) { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize") NODE_VALIDATION_CHECK(this, inputShapes.size() == m_body->get_parameters().size(), @@ -184,31 +216,30 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape return std::get<0>(lhs).size() < std::get<0>(rhs).size(); }); }; - Shape baseShape; + PartialShape baseShape; AxisVector baseOrder; std::tie(baseShape, baseOrder, std::ignore) = getMaxRankBlockedShape(inputShapes); const auto baseRank = baseShape.size(); const bool baseIsBlocked = baseOrder.size() != std::set(baseOrder.begin(), baseOrder.end()).size(); for (size_t i = 0; i < inputShapes.size(); i++) { const auto &blockedShape = inputShapes[i]; - Shape inShape; + PartialShape inShape; AxisVector inOrder; element::Type inType; std::tie(inShape, inOrder, inType) = blockedShape; const auto inRank = inShape.size(); NODE_VALIDATION_CHECK(this, inRank <= baseRank, "Input rank can't be larger than output rank in snippets."); if (inRank < baseRank) { - Shape newShape(baseRank, 1); + PartialShape newShape(ov::Shape(baseRank, 1)); // todo: more complicated logics is needed if we want to merge smth else than blocked and planar - // could be done by PartialShape::broadcast_merge_into, but this way is faster - size_t startOffset = baseRank - inRank; if (baseIsBlocked) { const bool inIsNotBlocked = inOrder.size() == std::set(inOrder.begin(), inOrder.end()).size(); NODE_VALIDATION_CHECK(this, inIsNotBlocked, "Snippets don't support conversion between blocked layouts of different ranks"); - startOffset--; + inShape.insert(inShape.end(), 
ov::Dimension(1)); } - std::copy(inShape.begin(), inShape.end(), &newShape[startOffset]); - inShape = move(newShape); + NODE_VALIDATION_CHECK(this, PartialShape::broadcast_merge_into(newShape, inShape, ov::op::AutoBroadcastType::NUMPY), + "Failed to broadcast_merge inputs in snippets canonicalization"); + inShape = std::move(newShape); } else { // todo: 4d blocked + 5d planar layouts are not supported: + NODE_VALIDATION_CHECK(this, @@ -219,30 +250,30 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape NODE_VALIDATION_CHECK(this, PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY), "Failed to create broadcastable shapes in snippets canonicalization"); - const auto paramShape = m_body->get_parameters()[i]->get_shape(); + const auto paramShape = m_body->get_parameters()[i]->get_partial_shape(); const auto paramType = m_body->get_parameters()[i]->get_element_type(); if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin())) m_body->replace_parameter(i, std::make_shared(paramType, inShape)); } - m_body->validate_nodes_and_infer_types(); - auto skipStartEndOnes = [](const Shape& shape) { + auto skipStartEndOnes = [](const PartialShape& shape) { auto begin = shape.begin(); auto end = shape.end(); while (begin != end && *begin == 1) begin++; while (begin != end && *(end-1) == 1) end--; - Shape trimmedShape(end - begin, 1); + + PartialShape trimmedShape(std::vector (end - begin, 1)); std::copy(begin, end, trimmedShape.begin()); return trimmedShape; }; // Check that output shapes are broadcastable => can be scheduled const auto& body_results = m_body->get_results(); - PartialShape outPShape = body_results[0]->get_shape(); + PartialShape outPShape = body_results[0]->get_input_partial_shape(0); for (size_t i = 0; i < body_results.size(); i++) { - auto shape_i = body_results[i]->get_shape(); + auto shape_i = body_results[i]->get_input_partial_shape(0); auto 
outputShape_i = std::get<0>(outputShapes[i]); // Check that the produced output shape corresponds to the passed shape // Some produced shapes may have been changed to be broadcastable (e.g. blocked + planar outputs), @@ -250,9 +281,7 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape PartialShape pShape_i(skipStartEndOnes(shape_i)); bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, skipStartEndOnes(outputShape_i), ::ngraph::op::AutoBroadcastType::NUMPY); - NODE_VALIDATION_CHECK(this, ov::shape_size(shape_i) == ov::shape_size(outputShape_i) && - compatibleWithPassedShape, "Inferred and passed results shapes are incompatible for snippet ", - get_friendly_name(), " : ", shape_i, " vs ", outputShape_i, "."); + NODE_VALIDATION_CHECK(this, compatibleWithPassedShape, "Inferred and passed results shapes are incompatible for snippet "); // Check that output shapes are broadcastable to each other => can be scheduled bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i, ::ngraph::op::AutoBroadcastType::NUMPY); @@ -263,8 +292,18 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape // to align precision inside Subgraph body that is supported by Plugin align_element_types(outputShapes, inputShapes); - exec_domain = outPShape.get_shape(); - return exec_domain; + master_shape = outPShape; + return master_shape; +} + +PartialShape snippets::op::Subgraph::get_master_shape() { + auto results = m_body->get_results(); + PartialShape outPShape = results[0]->get_input_partial_shape(0); + for (const auto& r : results) + PartialShape::broadcast_merge_into(outPShape, r->get_input_shape(0), + ::ngraph::op::AutoBroadcastType::NUMPY); + master_shape = outPShape; + return master_shape; } void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, @@ -307,42 +346,60 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { 
INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect") auto skip_matching_domain = [](const std::shared_ptr& n) -> bool { - return n->get_input_shape(0).back() != 1; + const auto& pshape = n->get_input_partial_shape(0); + const auto& last_dim = pshape[pshape.size() - 1]; + return last_dim.is_dynamic() || last_dim.get_length() != 1; }; // At the moment we support only full vector Load/Store and scalar Load/Store so that count is equal to lanes. // Then we are going to support variadic Load/Store with different element count const size_t count = m_generator->get_target_machine()->get_lanes(); + const auto & params = m_body->get_parameters(); + bool inputs_has_dynamic_last_dims = std::any_of(params.begin(), params.end(), + [](const shared_ptr& p){ + return p->get_partial_shape().rbegin()->is_dynamic(); + }); ngraph::pass::Manager manager; manager.register_pass(); manager.register_pass(); manager.register_pass(count); manager.register_pass(count); - manager.register_pass(); - manager.register_pass(); - // Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for - // simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove - // could also be inserted after the ngraph::op, if the op input don't need broadcasting, but the the output does - // (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced - // with ScalarLoads (ScalarStores) to avoid invalid read in vector Tile. Graph example: - // Parameter_0 Parameter_1 Parameter_2 - // [1,2,5,16] [1,2,5,1] [1,2,5,1] - // Load BroadcastLoad Load* Scalar - // Add Subtract - // \___________ ___________BroadcastMove - // \ / - // Multiply - // Store - // Result - // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile. 
- if (!exec_domain.empty() && exec_domain.back() != 1) { - manager.register_pass(); - manager.register_pass(); - manager.get_pass_config()-> - set_callback(skip_matching_domain); - manager.get_pass_config()-> - set_callback(skip_matching_domain); + // todo: presently dynamic pipeline is activated even if the last two dimensions are static + // In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example) + // should be passed as run-time args, so it's a mixed regime: kernel is shape-aware, but some additional runtime args are required + // Presently Broadcasting is organized in the following way: + // * ALL last dims are static => broadcasting is handled via MoveBroadcast and pointer arithmetic (even for dynamic upper dims) + if (!inputs_has_dynamic_last_dims) { + manager.register_pass(); + manager.register_pass(); + // Note that, BroadcastMove is typically inserted right after the Load. Such cases are typical for + // simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove + // could also be inserted after the ngraph::op, if the op input doesn't need broadcasting, but the output does + // (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced + // with ScalarLoads (ScalarStores) to avoid invalid read in vector Loop. Graph example: + // Parameter_0 Parameter_1 Parameter_2 + // [1,2,5,16] [1,2,5,1] [1,2,5,1] + // Load BroadcastLoad Load* Scalar + // Add Subtract + // \___________ ___________BroadcastMove + // \ / + // Multiply + // Store + // Result + // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Loop. 
+ if (master_shape.size() != 0 && master_shape[master_shape.size() - 1] != 1) { + manager.register_pass(); + manager.register_pass(); + manager.get_pass_config()-> + set_callback(skip_matching_domain); + manager.get_pass_config()-> + set_callback(skip_matching_domain); + } + // todo: get_lanes() assumes fp32. Could there be any int8 issues? + // Note that InsertLoops requires validate_and_infer_types afterwards, so add it manually if + // automatic validation will be disabled in the pass manager + manager.register_pass(master_shape, m_generator->get_target_machine()->get_lanes()); } manager.run_passes(m_body); } @@ -371,14 +428,14 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate") NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set"); + + convert_to_snippet_dialect(); opt.run_passes(m_body); - // generation flow snippets::pass::AssignRegisters().run_on_model(m_body); // schedule generation should go here and be target agnostic - // actual code emission ngraph::snippets::code ptr = m_generator->generate(m_body, compile_params); @@ -393,7 +450,7 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, } NGRAPH_CHECK(!constants.size(), "External constants detected. 
Snippet is illigal for scheduling"); - return {exec_domain, false /*canBeLinearized*/, ptr}; + return {master_shape, false /*canBeLinearized*/, ptr}; } void snippets::op::Subgraph::print() const { diff --git a/src/common/snippets/src/op/tile.cpp b/src/common/snippets/src/op/tile.cpp deleted file mode 100644 index b37e212fdcf88d..00000000000000 --- a/src/common/snippets/src/op/tile.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/tile.hpp" -#include "snippets/generator.hpp" - -using namespace std; -using namespace ngraph; - -snippets::op::Tile::Tile(const std::vector& region, size_t increment, - size_t num_inputs, size_t num_outputs, - const std::vector& io_dims, const std::vector& io_data_sizes) : - Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) { -} diff --git a/src/common/snippets/src/op/tile_scheduler.cpp b/src/common/snippets/src/op/tile_scheduler.cpp deleted file mode 100644 index fd0ba9e6a23223..00000000000000 --- a/src/common/snippets/src/op/tile_scheduler.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/tile_scheduler.hpp" -#include "snippets/generator.hpp" - -ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region) - : Op(), vector_region{vector_region}, scalar_region{scalar_region} { -} diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index a0429fcfc9996f..7478ed39263ff1 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -2,81 +2,166 @@ // SPDX-License-Identifier: Apache-2.0 // -// #include #include -#include "snippets/remarks.hpp" - #include 
"snippets/pass/assign_registers.hpp" #include "snippets/snippets_isa.hpp" -#include - -#include - bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr& f) { RUN_ON_MODEL_SCOPE(AssignRegisters); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") using Reg = size_t; + using tensor = std::shared_ptr; auto ops = f->get_ordered_ops(); - decltype(ops) stmts; - std::copy_if(ops.begin(), ops.end(), std::back_inserter(stmts), [](decltype(ops[0]) op) { - return !(std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)); - }); + // Note that currently there are 3 types of ops: + // * gpr->gpr: (Parameter, Result, LoopBegin, LoopEnd) will also be Buffer? + // * gpr->vec: or vec->gpr Load/LoadConvert, Store/StoreConvert, BroadcastLoad etc. + // * vec->vec: all other "normal" operations that perform calculations on vector registers: Add, BroadcastMove, Power, etc. + enum op_reg_type {gpr2gpr, gpr2vec, vec2gpr, vec2vec}; - size_t rdx = 0; - std::map, Reg> regs; - for (const auto& op : stmts) { - for (const auto& output : op->outputs()) { - regs[output.get_tensor_ptr()] = rdx++; + auto get_op_reg_type = [](const std::shared_ptr& op) { + if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) + return gpr2gpr; + else if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) + return gpr2vec; + else if (std::dynamic_pointer_cast(op)) + return vec2gpr; + else + return vec2vec; + }; + std::vector>> typed_ops; + for (const auto& op : ops) + typed_ops.emplace_back(std::make_pair(get_op_reg_type(op), op)); + size_t counter_vec = 0; + size_t counter_gpr = 0; + std::map regs_vec, regs_gpr; + // Define a set of immune tensors that will be ignored by auto reg allocation => their reg allocation is done manually + // todo: presently it hold only gpr's. 
If you need to manually assign vec's, implement reg_type or create a second map + std::map manually_assigned_regs; + const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX; + const auto num_parameters = f->get_parameters().size(); + for (const auto& op : ops) { + if (const auto& param = ov::as_type_ptr(op)) { + manually_assigned_regs[op->output(0).get_tensor_ptr()] = + static_cast(f->get_parameter_index(param)); + } else if (const auto& result = ov::as_type_ptr(op)) { + // here we use the fact that Result input & output tensors are identical by construction + manually_assigned_regs[op->output(0).get_tensor_ptr()] = + static_cast(f->get_result_index(result) + num_parameters); } } - - std::vector> used; - std::vector> def; - - for (const auto& op : stmts) { - std::set u; - for (const auto& input : op->inputs()) { - if (regs.count(input.get_tensor_ptr())) { - u.insert(regs[input.get_tensor_ptr()]); + auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG, &manually_assigned_regs] (const std::shared_ptr& op, + decltype(regs_vec)& reg_map, + size_t& counter) { + for (const auto& output : op->outputs()) { + const auto& t = output.get_tensor_ptr(); + // Note that some ops might have identical input&output tensors (Result and Tile* for ex.) + // so we have to check that the tensor has not been enumerated already + if (reg_map.count(t) == 0) { + reg_map[t] = manually_assigned_regs.count(t) == 0 ? 
counter++ : IS_MANUALLY_ALLOCATED_REG; } } - used.push_back(u); + }; + for (const auto& t_op : typed_ops) { + switch (t_op.first) { + case vec2vec: + case gpr2vec: + enumerate_out_tensors(t_op.second, regs_vec, counter_vec); + break; + case gpr2gpr: + case vec2gpr: + enumerate_out_tensors(t_op.second, regs_gpr, counter_gpr); + break; + } + } + // todo: make one for gpr and one for vector + std::vector> used_gpr(ops.size(), std::set()); // used = used as an input + std::vector> defined_gpr(ops.size(), std::set()); // defined = used as output + std::vector> used_vec(ops.size(), std::set()); + std::vector> defined_vec(ops.size(), std::set()); - std::set d; - if (!std::dynamic_pointer_cast(op)) { - for (const auto& output : op->outputs()) { - d.insert(regs[output.get_tensor_ptr()]); - } + auto tensor2reg = [IS_MANUALLY_ALLOCATED_REG] (const std::vector& tensors, const std::map& reg_map) { + std::set result; + for (const auto& t : tensors) { + if (reg_map.count(t) == 0) + throw ngraph::ngraph_error("Assign registers: attempt to access not enumerated tensor"); + Reg reg_id = reg_map.at(t); + if (reg_id != IS_MANUALLY_ALLOCATED_REG) + result.insert(reg_id); + } + return result; + }; + for (int i = 0; i < typed_ops.size(); i++) { + const auto& t_op = typed_ops[i]; + std::vector used_tensors, defined_tensors; + for (const auto& in : t_op.second->inputs()) + used_tensors.push_back(in.get_tensor_ptr()); + for (const auto& out : t_op.second->outputs()) + defined_tensors.push_back(out.get_tensor_ptr()); + switch (t_op.first) { + case vec2vec: + used_vec[i] = tensor2reg(used_tensors, regs_vec); + defined_vec[i] = tensor2reg(defined_tensors, regs_vec); + break; + case gpr2gpr: + used_gpr[i] = tensor2reg(used_tensors, regs_gpr); + defined_gpr[i] = tensor2reg(defined_tensors, regs_gpr); + break; + case gpr2vec: + used_gpr[i] = tensor2reg(used_tensors, regs_gpr); + defined_vec[i] = tensor2reg(defined_tensors, regs_vec); + break; + case vec2gpr: + used_vec[i] = tensor2reg(used_tensors, 
regs_vec); + defined_gpr[i] = tensor2reg(defined_tensors, regs_gpr); + break; } - def.push_back(d); } // define life intervals - std::vector> lifeIn(stmts.size(), std::set()); - std::vector> lifeOut(stmts.size(), std::set()); + // liveOut[i] - regs that are live on exit from i-th (topologically ordered) operation + // liveIn[i] - regs that are live on entering the i-th (topologically ordered) operation + std::vector> life_in_vec(std::move(used_vec)); + std::vector> life_out_vec(typed_ops.size(), std::set()); + std::vector> life_in_gpr(std::move(used_gpr)); + std::vector> life_out_gpr(typed_ops.size(), std::set()); - for (size_t i = 0; i < stmts.size(); i++) { - for (size_t n = 0; n < stmts.size(); n++) { - std::set_difference(lifeOut[n].begin(), lifeOut[n].end(), def[n].begin(), def[n].end(), std::inserter(lifeIn[n], lifeIn[n].begin())); - lifeIn[n].insert(used[n].begin(), used[n].end()); + // todo: this part is O(N*N), so it's slow for large subgraphs. Can we simplify it? At least add an early stopping criteria + for (size_t i = 0; i < typed_ops.size(); i++) { + for (size_t n = 0; n < typed_ops.size(); n++) { + // Regs that are live on entering the operation = regs used by the op + (all other regs alive - regs defined by the op) + // copy regs from lifeOut to lifeIn while ignoring regs in def + std::set_difference(life_out_gpr[n].begin(), life_out_gpr[n].end(), + defined_gpr[n].begin(), defined_gpr[n].end(), + std::inserter(life_in_gpr[n], life_in_gpr[n].begin())); + std::set_difference(life_out_vec[n].begin(), life_out_vec[n].end(), + defined_vec[n].begin(), defined_vec[n].end(), + std::inserter(life_in_vec[n], life_in_vec[n].begin())); } - for (size_t n = 0; n < stmts.size(); n++) { - auto node = stmts[n]; - if (!std::dynamic_pointer_cast(node)) { - for (const auto& out : node->outputs()) { - for (const auto& port : out.get_target_inputs()) { - auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this()); - if (pos != stmts.end()) { - 
auto k = pos-stmts.begin(); - lifeOut[n].insert(lifeIn[k].begin(), lifeIn[k].end()); - } + for (size_t n = 0; n < typed_ops.size(); n++) { + auto op = typed_ops[n].second; + for (const auto& out : op->outputs()) { + for (const auto& port : out.get_target_inputs()) { + auto k = std::find(ops.begin(), ops.end(), port.get_node()->shared_from_this()) - ops.begin(); + if (k == ops.size()) + throw ngraph_error("assign registers can't find target op in the body"); + switch (typed_ops[k].first) { + case vec2vec: + case vec2gpr: + life_out_vec[n].insert(life_in_vec[k].begin(), life_in_vec[k].end()); + break; + case gpr2gpr: + case gpr2vec: + life_out_gpr[n].insert(life_in_gpr[k].begin(), life_in_gpr[k].end()); + break; } } } } } - struct by_starting { auto operator()(const std::pair& lhs, const std::pair& rhs) const -> bool { return lhs.first < rhs.first|| (lhs.first == rhs.first && lhs.second < rhs.second); @@ -88,13 +173,15 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr return lhs.second < rhs.second || (lhs.second == rhs.second && lhs.first < rhs.first); } }; + // A variable live interval - is a range (start, stop) of op indexes, such that + // the variable is alive within this range (defined but not used by the last user) + std::map, Reg, by_starting> live_intervals_vec, live_intervals_gpr; - std::set, by_starting> live_intervals; - - std::reverse(lifeIn.begin(), lifeIn.end()); - auto find_last_use = [lifeIn](int i) -> int { - int ln = static_cast(lifeIn.size()) - 1; - for (auto& x : lifeIn) { + std::reverse(life_in_vec.begin(), life_in_vec.end()); + std::reverse(life_in_gpr.begin(), life_in_gpr.end()); + auto find_last_use = [](decltype(life_in_gpr) life_in, int i) -> int { + int ln = static_cast(life_in.size()) - 1; + for (auto& x : life_in) { if (x.find(i) != x.end()) { return ln; } @@ -102,67 +189,83 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr } return i; }; - - for (size_t i = 0; i < 
stmts.size(); i++) { - live_intervals.insert(std::make_pair(static_cast(i), find_last_use(static_cast(i)))); + for (int i = 0; i < static_cast(typed_ops.size()); i++) { + for (const auto& def : defined_vec[i]) + live_intervals_vec[std::make_pair(i, find_last_use(life_in_vec, static_cast(def)))] = def; + for (const auto& def : defined_gpr[i]) + live_intervals_gpr[std::make_pair(i, find_last_use(life_in_gpr, static_cast(def)))] = def; } - // http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf - std::multiset, by_ending> active; - std::map register_map; - std::stack bank; - for (int i = 0; i < 16; i++) bank.push(16-1-i); + auto linescan_assign_registers = [](const decltype(live_intervals_vec)& live_intervals, + const std::set& reg_pool) { + // http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf + // todo: do we need multimap? <=> can an op have two inputs from the same op? + std::map, Reg, by_ending> active; + // uniquely defined register => reused reg (reduced subset enabled by reg by reusage) + std::map register_map; + std::stack bank; + // regs are stored in ascending order in reg_pool, so walk in reverse to assign them the same way + for (auto rit = reg_pool.crbegin(); rit != reg_pool.crend(); rit++) + bank.push(*rit); - for (auto interval : live_intervals) { - // check expired - while (!active.empty()) { - auto x = *active.begin(); - if (x.second >= interval.first) { - break; + std::pair interval, active_interval; + Reg unique_reg, active_unique_reg; + for (const auto& interval_reg : live_intervals) { + std::tie(interval, unique_reg) = interval_reg; + // check expired + while (!active.empty()) { + std::tie(active_interval, active_unique_reg) = *active.begin(); + // if end of active interval has not passed yet => stop removing actives since they are sorted by end + if (active_interval.second >= interval.first) { + break; + } + active.erase(active_interval); + bank.push(register_map[active_unique_reg]); + } + // allocate + if (active.size() == 
reg_pool.size()) { + // todo: if it is LoopBegin or LoopEnd that requires gpr, and we don't have any in the pool, + // then assign SIZE_MAX-1 as a flag to spill a reg inside emitter + throw ngraph::ngraph_error("can't allocate registers for a snippet "); + } else { + register_map[unique_reg] = bank.top(); + bank.pop(); + active.insert(interval_reg); } - active.erase(x); - bank.push(register_map[x.first]); - } - // allocate - if (active.size() == 16) { - throw ngraph_error("caanot allocate registers for a snippet "); - } else { - register_map[interval.first] = bank.top(); - bank.pop(); - active.insert(interval); } - } + return register_map; + }; + // todo: vec_/gpr_pool are hardware-specific and should be provided by a backend, e.g. overloaded generator + std::set vec_pool; + for (Reg i = 0; i < 16; i++) + vec_pool.insert(i); + auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); + std::set gpr_pool(std::move(vec_pool)); + for (const auto& t_reg : manually_assigned_regs) + gpr_pool.erase(t_reg.second); + auto unique2reused_map_gpr = linescan_assign_registers(live_intervals_gpr, gpr_pool); - std::map, Reg> physical_regs; + std::map assigned_regs(std::move(manually_assigned_regs)); + auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map& unique_regs, + const std::map& unique2reused) { + for (const auto& reg : unique_regs) { + if (reg.second == IS_MANUALLY_ALLOCATED_REG) + continue; + if (unique2reused.count(reg.second) == 0) + throw ngraph::ngraph_error("Assign registers failed to allocate register for a tensor"); + assigned_regs[reg.first] = unique2reused.at(reg.second); + } + }; + register_assigned_regs(regs_vec, unique2reused_map_vec); + register_assigned_regs(regs_gpr, unique2reused_map_gpr); - for (const auto& reg : regs) { - physical_regs[reg.first] = register_map[reg.second]; - } - const auto num_parameters = f->get_parameters().size(); - for (const auto& n : f->get_ordered_ops()) { - auto& rt = 
n->get_rt_info(); - std::vector regs; - regs.reserve(n->outputs().size()); - /* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are - * then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details. - * Note also that Parameter and Result store general-purpose register index, because they work with memory - * (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are - * performed on registers. - */ - if (is_type(n)) { - continue; - } else if (const auto& param = ov::as_type_ptr(n)) { - regs.push_back(f->get_parameter_index(param)); - } else if (const auto& store = ov::as_type_ptr(n)) { - regs.push_back(f->get_result_index(store) + num_parameters); - } else { - for (const auto& output : n->outputs()) { - auto allocated = physical_regs[output.get_tensor_ptr()]; - regs.push_back(allocated); - } + for (const auto& t_op : typed_ops) { + for (const auto& out : t_op.second->outputs()) { + const auto& t = out.get_tensor_ptr(); + auto& rt = t->get_rt_info(); + rt["reginfo"] = static_cast(assigned_regs[t]); } - rt["reginfo"] = regs; } - return false; } + diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 57c737f992b89e..592601f681c682 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -14,6 +14,7 @@ #include #include #include "transformations/utils/utils.hpp" +#include "ngraph/op/util/attr_types.hpp" #include #include @@ -32,29 +33,15 @@ namespace pass { namespace { auto outputs_are_not_broadcastable(const std::shared_ptr& node) -> bool { - auto outputs = node->outputs(); - auto find_smallest_output_shape = [](const std::vector>& outputs) -> Shape { - return std::accumulate(std::begin(outputs), std::end(outputs), ngraph::Shape(outputs.begin()->get_shape()), - [](Shape& 
other_shape, const Output& output){ - return shape_size(output.get_shape()) < shape_size(other_shape) ? output.get_shape() : other_shape; - }); - }; - auto ref_shape = find_smallest_output_shape(outputs); - - auto check_shapes_broadcastable = [ref_shape](const Output& output) -> bool { - auto other_shape = output.get_shape(); - - if (other_shape.size() != ref_shape.size()) { - return false; - } - - return std::inner_product(std::begin(other_shape), std::end(other_shape), std::begin(ref_shape), true, - std::logical_and(), [](Shape::value_type lsh, Shape::value_type rsh){ - return rsh == 1 || lsh == rsh; - }); - }; - - return std::find_if_not(std::begin(outputs), std::end(outputs), check_shapes_broadcastable) != std::end(outputs); + const auto& outputs = node->outputs(); + if (outputs.size() <= 1) + return false; + ov::PartialShape ref_shape = outputs.front().get_partial_shape(); + bool success = true; + for (int i = 1; i < outputs.size() && success; i++) { + success &= ov::PartialShape::broadcast_merge_into(ref_shape, outputs[i].get_partial_shape(), ov::op::AutoBroadcastType::NUMPY); + } + return !success; } auto is_supported_op(const std::shared_ptr &n) -> bool { diff --git a/src/common/snippets/src/pass/convert_constants.cpp b/src/common/snippets/src/pass/convert_constants.cpp index 3cb791d0130163..3c2e8cee2a7a6f 100644 --- a/src/common/snippets/src/pass/convert_constants.cpp +++ b/src/common/snippets/src/pass/convert_constants.cpp @@ -20,11 +20,14 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() { ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars") auto constant = as_type_ptr(m.get_match_root()); - auto scalar = std::make_shared(*constant); + if (ov::shape_size(constant->get_output_shape(0)) != 1) + return false; + // Note that all Constants {1,1,1,1} are converted to Scalar {1} here + // This is 
needed to simplify shape inference, otherwise {1,1,1,1} Constants can increase output rank + auto scalar = std::make_shared(ov::op::v0::Constant(*constant, ov::Shape{1})); scalar->set_friendly_name(constant->get_friendly_name()); ngraph::copy_runtime_info(constant, scalar); ngraph::replace_node(constant, scalar); - return true; }; register_matcher(std::make_shared(constants), callback); diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp new file mode 100644 index 00000000000000..30c9a20883b8d5 --- /dev/null +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/pass/insert_loops.hpp" +#include "snippets/op/loop_helpers.hpp" + +#include + +ngraph::snippets::pass::InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t vector_size) +: master_shape(std::move(master_shape)), vector_size(vector_size) { +} + +bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr &model) { + RUN_ON_FUNCTION_SCOPE(InsertLoops); + if (master_shape.is_dynamic()) + throw ngraph_error("InsertLoops doesn't support dynamic shapes yet"); + + const auto inner_dim = master_shape.size() - 1; + // Note: outer_dim could overflow if master_shape.size() < 2 + const auto outer_dim = master_shape.size() - 2; + const auto inner_work_amount = master_shape[inner_dim].get_length(); + const auto outer_work_amount = master_shape.size() >= 2 ? 
master_shape[outer_dim].get_length() : 1; + + ParameterVector commonParams = model->get_parameters(); + // Note that topological sort parses node arguments in reversed order, but results are added - in direct order + // So we need to pass the reversed results to LoopEnd to keep the original traversal order in topological sorter + const auto& orig_results = model->get_results(); + ResultVector commonResults(orig_results.rbegin(), orig_results.rend()); + std::vector ioShapes; + ioShapes.reserve(commonParams.size() + commonResults.size()); + std::transform(commonParams.begin(), commonParams.end(), std::back_inserter(ioShapes), + [](const std::shared_ptr& n) { return n->get_output_partial_shape(0); }); + std::transform(commonResults.begin(), commonResults.end(), std::back_inserter(ioShapes), + [](const std::shared_ptr& n) { return n->get_input_partial_shape(0); }); + + if (inner_work_amount > 0) { + std::vector apply_increments; + apply_increments.reserve(ioShapes.size()); + // Inner Loop applies increments if a dimension is not broadcasted + std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments), + [=](const PartialShape& ps) { + return ps[inner_dim] != 1 && master_shape[inner_dim] != 1; + }); + std::vector inner_finalization_offsets(ioShapes.size(), 0); + if (outer_work_amount > 1) { + // We need to step back if an outer dim is broadcasted, while the corresponding lower one is not + std::transform(ioShapes.begin(), ioShapes.end(), inner_finalization_offsets.begin(), + [=](const PartialShape& ps) { + return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? 
-inner_work_amount : 0; + }); + } + const auto& inner_loop_begin = op::insertLoopBegin(commonParams); + const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_work_amount, + vector_size, apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. + for (const auto& n : model->get_ordered_ops()) { + if (auto c = std::dynamic_pointer_cast(n)) + c->add_control_dependency(inner_loop_begin); + else if (n == inner_loop_begin) + break; + } + } + + if (outer_work_amount > 1) { + std::vector apply_increments; + apply_increments.reserve(ioShapes.size()); + // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) + std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments), + [=](const PartialShape& ps) { + return ps[outer_dim] != 1 && ps[inner_dim] == 1; + }); + const auto& outer_loop_begin = op::insertLoopBegin(commonParams); + insertLoopEnd(commonResults, outer_loop_begin, outer_dim, outer_work_amount, 1, apply_increments); + } + + return true; +} \ No newline at end of file diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 23740a8aa03711..499be69e67f062 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ 
b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -18,16 +18,16 @@ using namespace ngraph; namespace { std::shared_ptr broadcast_node_last_dim(const ngraph::Output& value, - const ov::Shape& target_shape, const ov::Shape& normalized_shape) { + const ov::PartialShape& target_shape, const ov::PartialShape& normalized_shape) { std::shared_ptr broadcasted_node = value.get_node_shared_ptr(); - if (target_shape == value.get_shape()) { + if (target_shape == value.get_partial_shape()) { return broadcasted_node; } // Insert BroadcastMove only if the last dimension needs to be broadcasted. Higher-level dims broadcasting - // will be handled by pointer arithmetics in TileScheduler + // will be handled by pointer arithmetics inside outer LoopEmitter if (*target_shape.rbegin() != *normalized_shape.rbegin()) { - ov::Shape broadcasted_shape = normalized_shape; + ov::PartialShape broadcasted_shape = normalized_shape; *broadcasted_shape.rbegin() = *target_shape.rbegin(); broadcasted_node = std::make_shared(broadcasted_node, broadcasted_shape); } @@ -36,20 +36,20 @@ std::shared_ptr broadcast_node_last_dim(const ngraph::Output> get_numpy_broadcast_shapes(const std::vector& input_shapes) { +std::pair> get_numpy_broadcast_partial_shapes(const std::vector& input_shapes) { ov::PartialShape target_shape = input_shapes.front(); for (auto i = 1; i < input_shapes.size(); i++) { if (!ov::PartialShape::broadcast_merge_into(target_shape, input_shapes[i], op::AutoBroadcastType::NUMPY)) throw ngraph::ngraph_error("InsertMoveBroadcast: Failed broadcast-merge input shapes"); } - std::vector normalized_shapes; + std::vector normalized_shapes; for (const auto& input : input_shapes) { - ov::Shape padded_shape{input}; + ov::PartialShape padded_shape{input}; padded_shape.insert(padded_shape.begin(), target_shape.size() - padded_shape.size(), 1); normalized_shapes.push_back(std::move(padded_shape)); } - return {target_shape.get_shape(), normalized_shapes}; + return {target_shape, 
normalized_shapes}; } } // namespace @@ -72,15 +72,20 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { } return false; }; - std::vector input_shapes; + std::vector input_shapes; std::vector ignore_as_scalar; for (const auto& val : values) { - input_shapes.emplace_back(val.get_shape()); + input_shapes.emplace_back(val.get_partial_shape()); ignore_as_scalar.push_back(is_scalar_constant(val)); + // Do not insert MoveBroadcast if any of the last dims is dynamic, + // since we don't know if we really need it. In these cases, broadcasting will be performed + // by outer Loop based on runtime shapes. + if (!ignore_as_scalar.back() && !input_shapes.back().rbegin()->is_static()) + return false; } // find the output tensor's shape, then broadcast all inputs so that they are compatible with respect to the last dim - auto bcast_shapes = get_numpy_broadcast_shapes(input_shapes); + auto bcast_shapes = get_numpy_broadcast_partial_shapes(input_shapes); ngraph::OutputVector broadcasted_inputs; for (size_t i = 0; i < values.size(); ++i) { diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp index f63f9c933af14b..f3765e471971a2 100644 --- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp @@ -34,8 +34,8 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro return false; } - auto inshape = root->input(0).get_shape(); - auto outshape = root->output(0).get_shape(); + auto inshape = root->input(0).get_partial_shape(); + auto outshape = root->output(0).get_partial_shape(); auto broadcastload = std::make_shared(param, outshape); ngraph::copy_runtime_info(root, broadcastload); diff --git a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp index 5af4af2a32b099..c156534d180843 100644 --- 
a/src/common/snippets/tests/include/lowering_utils.hpp +++ b/src/common/snippets/tests/include/lowering_utils.hpp @@ -38,10 +38,15 @@ class DummyGenerator : public ngraph::snippets::Generator { }; class LoweringTests : public TransformationTestsF { +public: + void SetUp() override; + void TearDown() override; protected: static std::shared_ptr getSubgraph(const std::shared_ptr& f); - static std::shared_ptr getLoweredSubgraph(const std::shared_ptr& f); + static std::shared_ptr getLoweredSubgraph(const std::shared_ptr& f, + const ov::PartialShape& master_shape); static std::shared_ptr getTokenizedSubgraph(const std::shared_ptr& f); + ov::PartialShape master_shape{}; }; } // namespace snippets diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index 4aab86d5d7c07c..ec644e62e514e1 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -30,8 +30,31 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor; +} + +void LoweringTests::SetUp() { + manager.register_pass(); +} + +void LoweringTests::TearDown() { + auto cloned_function = ngraph::clone_function(*function); + if (!function_ref) { + function_ref = cloned_function; + } + manager.run_passes(function); + ASSERT_NO_THROW(check_rt_info(function)); + + if (comparator.should_compare(FunctionsComparator::ACCURACY)) { + auto acc_comparator = 
FunctionsComparator::no_default(); + acc_comparator.enable(FunctionsComparator::CmpValues::ACCURACY); + auto res = acc_comparator.compare(function, cloned_function); + ASSERT_TRUE(res.valid) << res.message; + comparator.disable(FunctionsComparator::CmpValues::ACCURACY); + } + auto res = comparator.compare(function, function_ref); + ASSERT_TRUE(res.valid) << res.message; } std::shared_ptr LoweringTests::getSubgraph(const std::shared_ptr& f) { @@ -52,9 +75,11 @@ std::shared_ptr LoweringTests::getSubgraph(const return subgraph; } -std::shared_ptr LoweringTests::getLoweredSubgraph(const std::shared_ptr &f) { +std::shared_ptr LoweringTests::getLoweredSubgraph(const std::shared_ptr &f, + const ov::PartialShape& master_shape) { auto subgraph = getTokenizedSubgraph(f); subgraph->set_generator(std::make_shared()); + subgraph->set_master_shape(master_shape); subgraph->generate(); return subgraph; } diff --git a/src/common/snippets/tests/src/pass/canonicalization.cpp b/src/common/snippets/tests/src/pass/canonicalization.cpp index 15c33e6df96e10..7b687bad226443 100644 --- a/src/common/snippets/tests/src/pass/canonicalization.cpp +++ b/src/common/snippets/tests/src/pass/canonicalization.cpp @@ -23,12 +23,12 @@ std::string CanonicalizationTests::getTestCaseName(testing::TestParamInfoGetParam(); input_blocked_shapes = {std::get<1>(inputs[0]), std::get<1>(inputs[1])}; - snippets_function = std::make_shared(std::vector{std::get<0>(inputs[0]), std::get<0>(inputs[1])}); + snippets_function = std::make_shared(std::vector{std::get<0>(inputs[0]), std::get<0>(inputs[1])}); } TEST_P(CanonicalizationTests, Add) { @@ -50,8 +50,9 @@ TEST_P(CanonicalizationTests, Add) { function_ref = snippets_function->getReference(); auto subgraph = getTokenizedSubgraph(function); subgraph->set_generator(std::make_shared()); - Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); - ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape); + auto 
canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); + ASSERT_TRUE(canonical_output_shape.is_static()); + ASSERT_DIMS_EQ(canonical_output_shape.get_shape(), expected_output_shape); } namespace CanonicalizationTestsInstantiation { diff --git a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp index 3e578119b25d19..aa26ecfe4cdb74 100644 --- a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp @@ -20,56 +20,56 @@ void CollapseSubgraphTests::run() { } TEST_F(CollapseSubgraphTests, smoke_Snippets_Eltwise) { - const auto &f = EltwiseFunction(std::vector {{2, 3}, {1, 3}}); + const auto &f = EltwiseFunction(std::vector {{2, 3}, {1, 3}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_MatMulWithEltwise) { - const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); + const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) { - const auto &f = EltwiseLogLoopFunction(std::vector {{2, 5}, {2, 1}}); + const auto &f = EltwiseLogLoopFunction(std::vector {{2, 5}, {2, 1}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) { - const auto &f = ConvertFunction(std::vector{{2, 5}}); + const auto &f = ConvertFunction(std::vector{{2, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) { - const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); + const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } 
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) { - const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); + const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) { - const auto &f = ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); + const auto &f = ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) { - const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, + const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, std::vector{ov::element::i8, ov::element::bf16, ov::element::f32}, std::vector{ov::element::f32, ov::element::i8}); function = f.getOriginal(); diff --git a/src/common/snippets/tests/src/pass/insert_load_store.cpp b/src/common/snippets/tests/src/pass/insert_load_store.cpp index 9913225763b729..c76cc1ae3e26b1 100644 --- a/src/common/snippets/tests/src/pass/insert_load_store.cpp +++ b/src/common/snippets/tests/src/pass/insert_load_store.cpp @@ -25,16 +25,20 @@ std::string InsertLoadStoreTests::getTestCaseName(testing::TestParamInfo inputShapes(3); std::vector broadcastShapes(3); std::tie(inputShapes[0], inputShapes[1], inputShapes[2], broadcastShapes[0], broadcastShapes[1], broadcastShapes[2]) = this->GetParam(); - snippets_function = std::make_shared(inputShapes, broadcastShapes); + snippets_function = std::make_shared( + std::vector {inputShapes[0], inputShapes[1], inputShapes[2]}, broadcastShapes); + master_shape = inputShapes[0]; } TEST_P(InsertLoadStoreTests, ThreeInputsEltwise) { - auto subgraph = getLoweredSubgraph(snippets_function->getOriginal()); + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + 
master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); function = subgraph->get_body(); function_ref = snippets_function->getLowered(); } diff --git a/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp b/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp index 9be1c569a81b26..f4f3250530865c 100644 --- a/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp @@ -24,15 +24,22 @@ std::string InsertMoveBroadcastTests::getTestCaseName(testing::TestParamInfo inputShapes(2); std::vector broadcastShapes(2); std::tie(inputShapes[0], inputShapes[1], broadcastShapes[0], broadcastShapes[1]) = this->GetParam(); - snippets_function = std::make_shared(inputShapes, broadcastShapes); + snippets_function = std::make_shared(std::vector {inputShapes[0], inputShapes[1]}, broadcastShapes); + if (inputShapes[0].size() != inputShapes[1].size()) + IE_THROW() << "Expected input shapes of the same size"; + master_shape = {}; + for (int i = 0; i < inputShapes[0].size(); i++) + master_shape.push_back(static_cast(std::max(inputShapes[0][i], inputShapes[1][i]))); } TEST_P(InsertMoveBroadcastTests, AddBroadcast) { - auto subgraph = getLoweredSubgraph(snippets_function->getOriginal()); + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); function = subgraph->get_body(); function_ref = snippets_function->getLowered(); } diff --git a/src/common/snippets/tests/src/registers.cpp b/src/common/snippets/tests/src/registers.cpp index 2eb5cddd84fb9f..4b53f0e8092f67 100644 --- a/src/common/snippets/tests/src/registers.cpp +++ b/src/common/snippets/tests/src/registers.cpp @@ -33,6 +33,8 @@ TEST(TransformationTests, AssignRegisters) { auto s00 = std::make_shared(y02); s00->set_friendly_name("y03"); 
s00->set_friendly_name("s00"); f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1}); + // Note that testing the result is not strictly necessary, since the Result doesn't emit any code + f->get_result()->set_friendly_name("r00"); pass::Manager m; m.register_pass(); @@ -52,18 +54,19 @@ TEST(TransformationTests, AssignRegisters) { {"y01", 1}, {"y02", 2}, {"s00", 2}, // gpr + {"r00", 2} // gpr }; auto total_ops = 0; for (auto& op : f->get_ordered_ops()) { - auto& rt = op->get_rt_info(); - - auto it_rinfo = rt.find("reginfo"); - if (it_rinfo != rt.end()) { - auto reginfo = it_rinfo->second.as>(); - auto reg = reginfo[0]; - ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); - total_ops++; + for (const auto& output : op->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) { + auto reg = it_rt->second.as(); + ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); + total_ops++; + } } } ASSERT_EQ(total_ops, ref_registers.size()); @@ -120,6 +123,7 @@ TEST(TransformationTests, AssignRegisters2) { s00->set_friendly_name("s00"); f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7}); + f->get_result()->set_friendly_name("res00"); pass::Manager m; m.register_pass(); @@ -140,17 +144,19 @@ TEST(TransformationTests, AssignRegisters2) { {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6}, {"r24", 1}, {"s00", 8}, + {"res00", 8} }; auto total_ops = 0; for (auto& op : f->get_ordered_ops()) { - auto& rt = op->get_rt_info(); - auto it_rinfo = rt.find("reginfo"); - if (it_rinfo != rt.end()) { - auto reginfo = it_rinfo->second.as>(); - auto reg = reginfo[0]; - ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); - total_ops++; + for (const auto& output : op->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) { + auto reg = it_rt->second.as(); + 
ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); + total_ops++; + } } } ASSERT_EQ(total_ops, ref_registers.size()); diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index bba788545f2087..634a9e5b38c3ea 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -123,8 +123,8 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter); - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter); + jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = CREATE_EMITTER(LoopBeginEmitter); + jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = CREATE_EMITTER(LoopEndEmitter); } size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const { @@ -141,7 +141,9 @@ bool ov::intel_cpu::CPUTargetMachine::is_supported() const { } code ov::intel_cpu::CPUTargetMachine::get_snippet() const { - h->create_kernel(); + if (h->create_kernel() != status::success) { + IE_THROW() << "Failed to create jit_kernel in get_snippet()"; + } return h->jit_ker(); } diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 2130457847f3a2..3dc0a1e043d2a7 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -7,8 +7,10 @@ #include #include "jit_snippets_emitters.hpp" +#include "snippets/op/subgraph.hpp" using namespace Xbyak; +using ngraph::snippets::op::Subgraph; namespace ov { namespace intel_cpu { 
@@ -23,57 +25,70 @@ jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_container_emitter::map_abstract_registers(const std::vector &vec_pool, const std::vector &gpr_pool, - std::set& vecs_used, std::set& gprs_used) { - if (body.empty()) - IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty"; - auto abstract_to_physical = [](const std::vector& abstract_regs, const std::vector& regs_pool) { +void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, + std::vector& allocated_emitters) const { + if (allocated_emitters.empty()) + IE_THROW() << "Cannot map registers when there is no allocated_emitters provided"; + auto map_regs = [](const std::vector& abstract_regs, mapping_info& mapping) { + auto& abstract_to_physical = mapping.first; + auto& regs_pool = mapping.second; std::vector physical_regs(abstract_regs.size()); - for (size_t i = 0; i < abstract_regs.size(); i++) - physical_regs[i] = regs_pool.at(abstract_regs[i]); + for (size_t i = 0; i < abstract_regs.size(); i++) { + const auto abstract = abstract_regs[i]; + auto& physical = physical_regs[i]; + if (abstract_to_physical.count(abstract) == 0) { + if (regs_pool.empty()) + IE_THROW() << "Cannot map registers for jit_container_emitter: not enough regs in the pool"; + physical = regs_pool.back(); + regs_pool.pop_back(); + abstract_to_physical[abstract] = physical; + } else { + physical = abstract_to_physical[abstract]; + } + } return physical_regs; }; - for (auto& code : body) { + + for (auto& code : allocated_emitters) { const auto& emitter = code.first; std::vector in_abstract_regs, out_abstract_regs; std::tie(in_abstract_regs, out_abstract_regs) = code.second; std::vector in_physical_regs, out_physical_regs; switch (std::dynamic_pointer_cast(emitter)->get_in_out_type()) { case gpr_to_gpr: - // Note that gpr_to_gpr is used for high-level utility 
operations like Kernel/TileScheduler/Tile. + // Note that gpr_to_gpr is used for high-level utility operations like Kernel/Loop. // Input registers are not mapped in this case, since they contain utility info - // (num_params, tile increment, etc.), but not reg indexes. - in_physical_regs = std::move(in_abstract_regs); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); - gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + // (num_params, loop increment, etc.), but not reg indexes. + // todo: Note that LoopBeginEmitter and LoopEndEmitter demonstrate new paradigm, + // where all utility emitters align with conventional Op emitters + if (std::dynamic_pointer_cast(emitter) || + std::dynamic_pointer_cast(emitter)) + in_physical_regs = std::move(map_regs(in_abstract_regs, gpr_map_pool)); + else + in_physical_regs = std::move(in_abstract_regs); + out_physical_regs = std::move(map_regs(out_abstract_regs, gpr_map_pool)); break; case gpr_to_vec: // Load Emitters - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, gpr_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); - gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, gpr_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, vec_map_pool)); break; case vec_to_gpr: // Store Emitters - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); - vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, vec_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, gpr_map_pool)); break; case 
vec_to_vec: // Regular operations - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); - vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, vec_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, vec_map_pool)); break; default: IE_THROW() << "Unhandled in_out type"; } code.second = std::make_pair(in_physical_regs, out_physical_regs); if (auto container = std::dynamic_pointer_cast(code.first)) - container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used); + container->map_abstract_registers(gpr_map_pool, vec_map_pool, allocated_emitters); } } @@ -84,15 +99,18 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: IE_THROW() << "KernelEmitter invoked with invalid op argument"; if (kernel->region.empty()) IE_THROW() << "KernelEmitter invoked with empty body"; + if (kernel->compile_params == nullptr) + IE_THROW() << "KernelEmitter invoked with op::Kernel that contains no compile_params"; body = kernel->region; - if (!kernel->compile_params) - IE_THROW() << "KernelEmitter invoked without compile_params"; jcp = *reinterpret_cast(kernel->compile_params); // Initialize pools of gp and vec registers gp_regs_pool.resize(16); vec_regs_pool.resize(16); - std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0); - std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0); + // It's easier to remove the last item during mapping, so fill descending to map ascending + for (size_t i = 0; i < 16; i++) + gp_regs_pool[i] = vec_regs_pool[i] = 15 - i; + // todo: it's more convenient to use std::set as a pool container (unique and always sorted), + // but pools are vectors to align with emit_code signature. Change signature? 
auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { // It's important to keep the order of other elements pool.erase(std::remove_if(pool.begin(), pool.end(), @@ -101,14 +119,27 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: // Reserve stack base and pointer for push(...) and pop(...) operations // Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel remove_regs_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP, - static_cast(abi_param1.getIdx()), - static_cast(abi_param2.getIdx())}); - std::set vecs_used, gprs_used; - map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used); - remove_regs_from_pool(gp_regs_pool, gprs_used); - remove_regs_from_pool(vec_regs_pool, vecs_used); - // Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs - gp_regs_used = std::vector(gprs_used.begin(), gprs_used.end()); + reg_indexes_idx, reg_const_params_idx}); + + mapping_info gpr_map_pool({}, gp_regs_pool); + mapping_info vec_map_pool({}, vec_regs_pool); + std::vector data_io_emitters; + std::copy_if(body.begin(), body.end(), std::back_inserter(data_io_emitters), + [](const AllocatedEmitter& code){ + const auto& emitter = code.first; + const auto emitter_type = std::dynamic_pointer_cast(emitter)->get_in_out_type(); + return emitter_type == gpr_to_vec || emitter_type == vec_to_gpr; + }); + // Note that we can't use reg_indexes_idx or reg_const_params_idx to store data pointers because these two + // regs are used to calculate offsets for the data pointers + map_abstract_registers(gpr_map_pool, vec_map_pool, data_io_emitters); + for (const auto& abstract_to_physical : gpr_map_pool.first) + data_ptr_regs_idx.push_back(abstract_to_physical.second); + // However we can use reg_indexes_idx and reg_const_params_idx for other operations since we won't need them + // after offsets calculation + 
gpr_map_pool.second.push_back(reg_indexes_idx); + gpr_map_pool.second.push_back(reg_const_params_idx); + map_abstract_registers(gpr_map_pool, vec_map_pool, body); } void KernelEmitter::emit_code(const std::vector &in, @@ -126,263 +157,211 @@ void KernelEmitter::validate_arguments(const std::vector &in, if (in.size() != 2) IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size(); if (!out.empty()) - IE_THROW() << "KKernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); + IE_THROW() << "KernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); + const auto num_params = in[0] + in[1]; + // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount + if (data_ptr_regs_idx.size() != num_params) + IE_THROW() << "KernelEmitter arguments are inconsistent with the gpr_regs_used size: in[0] + in[1] = " + << num_params << " data_ptr_regs_idx.size() = " << data_ptr_regs_idx.size(); } void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { - const int64_t harness_num_dims = jcp.output_dims.size() - 1; - auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) { - for (int j = 0; j < harness_num_dims; j++) { - if (jcp.output_dims[j] != 1 && offsets[j] != 0) { + // master_shape size must be valid in both static and dynamic cases + const int64_t offsetRank = jcp.master_shape.size() - 1; + std::function init_ptr_with_offset; + init_ptr_with_offset = [&](Reg64 pointer, size_t offset_start_index, Reg64 reg_tmp) { + const int64_t *offsets = jcp.data_offsets + offset_start_index; + for (int j = 0; j < offsetRank; j++) { + if (jcp.master_shape[j] != 1 && offsets[j] != 0) { h->mov(reg_tmp, offsets[j]); h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]); h->add(pointer, reg_tmp); } } }; - for (auto i = 0; i < 
num_params; i++) { + const auto spare_corruptable_gpr = std::find_if(gp_regs_pool.begin(), gp_regs_pool.end(), + [this](size_t reg) { + return reg != reg_indexes_idx && reg != reg_const_params_idx; + }); + const bool last_iter_explicitly = spare_corruptable_gpr == gp_regs_pool.end(); + Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs.back() : Reg64(static_cast(*spare_corruptable_gpr)); + size_t i = 0; + for (; i < num_params - last_iter_explicitly; i++) { if (i < num_inputs) h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); else h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); - // we can use the last data_ptr_reg as tmp_reg until the last iteration, and reg_const_params then - Reg64 reg_tmp = i < num_params-1 ? data_ptr_regs.back() : reg_const_params; - init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp); + init_ptr_with_offset(data_ptr_regs[i], i * offsetRank, reg_tmp); + } + // a rare case when num_params is maximal, so we have no spare gprs + // * Static case: we can use reg_const_params as the last reg_tmp for the last iteration (and corrupt it), since + // it won't be used anymore + // * Dynamic case: we will need reg_const_params to pass runtime args to LoopScheduler, so we have to + // push a reg on the stack, and restore it value afterwards + if (last_iter_explicitly) { + h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); + reg_tmp = reg_const_params; + // can corrupt reg_const_params, since we won't use it anymore + init_ptr_with_offset(data_ptr_regs[i], i * offsetRank, reg_tmp); } } void KernelEmitter::emit_impl(const std::vector& in, const std::vector& out, - const std::vector& allocated_vec_regs, - const std::vector& allocated_gp_regs, + const std::vector& vec_pool, + const std::vector& gpr_pool, const ov::intel_cpu::emitter_context *emit_context) const { 
h->preamble(); const size_t num_inputs = in[0]; const size_t num_outputs = in[1]; - Reg64 reg_indexes = Reg64(abi_param1.getIdx()); - Reg64 reg_const_params = Reg64(abi_param2.getIdx()); + Reg64 reg_indexes = Reg64(static_cast(reg_indexes_idx)); + Reg64 reg_const_params = Reg64(static_cast(reg_const_params_idx)); std::vector data_ptr_regs; - transform_idxs_to_regs(gp_regs_used, data_ptr_regs); + transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); - // todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool. - // we need a more elegant approach to avoid a full copy here - auto local_gpr_pool = gp_regs_pool; - local_gpr_pool.push_back(static_cast(reg_indexes.getIdx())); - local_gpr_pool.push_back(static_cast(reg_const_params.getIdx())); for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; std::tie(in_regs, out_regs) = c.second; - if (auto tile_scheduler = std::dynamic_pointer_cast(emitter)) - out_regs = gp_regs_used; - emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool); + emitter->emit_code(in_regs, out_regs, vec_regs_pool, gp_regs_pool); } h->postamble(); } -TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile_scheduler = ov::as_type_ptr(n); - if (!tile_scheduler) - IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument"; - if (!tile_scheduler->compile_params) - IE_THROW() << "TileEmitter invoked without compile_params"; - body = {tile_scheduler->vector_region, tile_scheduler->scalar_region}; - jcp = *reinterpret_cast(tile_scheduler->compile_params); -} -void TileSchedulerEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { + 
+LoopBeginEmitter::LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + loop_begin = ov::as_type_ptr(n); + if (!loop_begin) + IE_THROW() << "LoopBeginEmitter invoked with invalid op argument"; + const auto& target_inputs = loop_begin->output(loop_begin->get_output_size() - 1).get_target_inputs(); + // todo: this check could be excessive, since we check for it in validate_and_infer_types() + if (target_inputs.size() != 1) + IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must have exactly one input attached"; + const auto loop_end = ov::as_type_ptr(target_inputs.begin()->get_node()->shared_from_this()); + if (!loop_end) + IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must be LoopEnd"; + work_amount = loop_begin->get_work_amount(); + evaluate_once = loop_begin->get_evaluate_once(); + num_inputs = loop_begin->get_input_size(); + in_out_type_ = emitter_in_out_map::gpr_to_gpr; +} + +void LoopBeginEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { validate_arguments(in, out, pool, gpr); emit_impl(in, out, pool, gpr, nullptr); } -void TileSchedulerEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (in.size() != 3) - IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size(); - if (out.size() != in[0] + in[1]) - IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. 
Expected " << in[0] + in[1] << " , got " << out.size(); - if (body.size() != 2) - IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size(); - if (!(std::dynamic_pointer_cast(body[0].first) && std::dynamic_pointer_cast(body[1].first))) - IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body"; -} - -void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, const std::vector& data_ptr_regs, size_t vector_size, - const std::vector& vec_pool, const std::vector& gpr_pool) const { - // TileAllocatedEmitter is just an alias to perform dynamic_pointer_cast only once and reuse it below several times - using TileAllocatedEmitter = std::pair, const ngraph::snippets::RegInfo&>; - TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast(body[0].first), body[0].second}; - TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast(body[1].first), body[1].second}; - const size_t inner_work_amount = jcp.scheduler_dims[1]; - auto process_tile = - [&](const bool evaluate_once, const TileAllocatedEmitter& tile) { - // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks - if (evaluate_once) { - tile.first->emit_body(vec_pool, gpr_pool); - } else { - std::vector in_regs, out_regs; - std::tie(in_regs, out_regs) = tile.second; - // pass work_amount reg to Tile - in_regs.push_back(static_cast(reg_inner_amount.getIdx())); - for (const auto& reg : data_ptr_regs) - out_regs.emplace_back(reg.getIdx()); - tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool); - } - }; - // todo: these optimizations should be performed on using Tile graph representation in the future - bool vector_evaluate_once = false; - if (inner_work_amount >= vector_size) { - vector_evaluate_once = inner_work_amount < 2 * vector_size; - // Need to set proper work amount for inner tiles if evaluated multiple times - if (!vector_evaluate_once) - 
h->mov(reg_inner_amount, inner_work_amount); - process_tile(vector_evaluate_once, vector_tile); - } - if (inner_work_amount % vector_size >= 1) { - bool scalar_evaluate_once = inner_work_amount % vector_size < 2; - if (!scalar_evaluate_once) { - // vector_tile is not executed, work_amount is not set - if (inner_work_amount < vector_size) { - h->mov(reg_inner_amount, inner_work_amount); - // vector_tile is executed, but work_amount is neither set nor decremented appropriately. - } else if (vector_evaluate_once) { - vector_tile.first->emit_ptr_increments(data_ptr_regs); - h->mov(reg_inner_amount, inner_work_amount - vector_size); - } - // else: vector_tile is executed multiple times, so work_amount is already set - } else { - if (vector_evaluate_once) { - vector_tile.first->emit_ptr_increments(data_ptr_regs); - } - } - process_tile(scalar_evaluate_once, scalar_tile); - } + +void LoopBeginEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != num_inputs) + IE_THROW() << "Invalid inputs size: expected " << num_inputs << " got " << in.size(); + if (out.size() != num_inputs + 1) + IE_THROW() << "Invalid outputs size: expected " << num_inputs + 1 << " got " << out.size(); } -void TileSchedulerEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - const size_t vector_size = in[2]; - const size_t num_params = num_inputs + num_outputs; - const auto& data_ptr_reg_idxs(out); - std::vector data_ptr_regs; - transform_idxs_to_regs(data_ptr_reg_idxs, data_ptr_regs); - // todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool. - // we need a more elegant approach to avoid a full copy here. 
Similar problem is demonstrated in KernelEmitter - auto local_gpr_pool = gpr_pool; - Reg64 reg_outer_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Reg64 reg_inner_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); +void LoopBeginEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + // todo: In dynamic case we will also need to set broadcasting info here + Reg64 reg_work_amount = Reg64(out.back()); Label for_body; - const size_t outer_work_amount = jcp.scheduler_dims[0]; - if (outer_work_amount == 1) { - // emit code directly without looping over external dim - emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - } else if (outer_work_amount > 1) { - // We need to create a Loop in this case - h->mov(reg_outer_amount, outer_work_amount); - h->L(for_body); - { - emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - - // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers - // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). - // To overcome this limitation, we add appropriate negative offsets if necessary. 
- for (auto i = 0; i < num_params; i++) { - if (jcp.scheduler_offsets[i] != 0) { - h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]); - } - } - // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar) - h->sub(reg_outer_amount, 1); - h->cmp(reg_outer_amount, 1); - h->jge(for_body, CodeGenerator::T_NEAR); - } + // save previous register state (if there is an outer loop that uses this reg for example) + if (!evaluate_once) { + h->mov(reg_work_amount, work_amount); } + // Note: loop address is not calculated at this point, so need to call calcJmpAddress() which is protected + // or ready(), but they both set internal flags and that's not a desired way to use them. + // So the most obvious WA is just to use current address manually + loop_begin->begin_address = h->getCurr(); + loop_begin->input_regs = in; +} + +LoopEndEmitter::LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + loop_end = ov::as_type_ptr(n); + if (!loop_end) + IE_THROW() << "LoopEndEmitter invoked with invalid op argument"; + loop_begin = loop_end->get_loop_begin(); + // todo: this check could be excessive, since we check for it in validate_and_infer_types() + if (!loop_begin) + IE_THROW() << "LoopEndEmitter invoked with invalid configuration: the last arg must be LoopBegin"; + // Note that 1 edge connects LoopBegin and LoopEnd + num_inputs = loop_begin->get_input_size(); + num_outputs = loop_end->get_output_size(); + increment = loop_end->get_increment(); + work_amount = loop_end->get_work_amount(); + apply_increments = loop_end->get_apply_increment(); + finalization_offsets = loop_end->get_finalization_offsets(); + evaluate_once = loop_end->get_evaluate_once(); + for (int i = 0; i < num_inputs; i++) + io_data_size.push_back(loop_begin->get_input_element_type(i).size()); + for (int i = 0; i < num_outputs; i++) + 
io_data_size.push_back(loop_end->get_input_element_type(i).size()); + in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -std::vector& TileEmitter::get_nested_code() { - return body; -} - -TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile = ov::as_type_ptr(n); - if (!tile) - IE_THROW() << "TileEmitter invoked with invalid op argument"; - body = tile->region; - if (body.empty()) - IE_THROW() << "TileEmitter is invoked with empty body"; - num_inputs = tile->num_inputs; - num_outputs = tile->num_outputs; - io_dims = tile->io_dims; - io_data_size = tile->io_data_size; - increment = tile->increment; - if (io_dims.size() != num_inputs + num_outputs) - IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()"; -} - -void TileEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { +void LoopEndEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { validate_arguments(in, out, pool, gpr); emit_impl(in, out, pool, gpr, nullptr); } -void TileEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (in.size() != 1) - IE_THROW() << "TileEmitter got invalid number of inputs. Expected 1, got " << in.size(); - if (out.size() != io_dims.size()) - IE_THROW() << "TileEmitter got invalid number of outputs. 
Expected " << io_dims.size() << " , got " << out.size(); -} -void TileEmitter::emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const { - for (auto& code : body) - code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool); -} - -void TileEmitter::emit_ptr_increments(const std::vector& data_ptr_regs) const { - for (size_t i = 0; i < num_inputs + num_outputs; i++) { - // those with dims == 1 will be broadcasted, hence don't require increment - if (io_dims[i] != 1) - h->add(data_ptr_regs[i], increment * io_data_size[i]); +void LoopEndEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (loop_begin->input_regs.size() != num_inputs) + IE_THROW() << "Invalid loop_begin->input_regs size: expected " << num_inputs << " got " << loop_begin->input_regs.size(); + if (out.size() != num_outputs) + IE_THROW() << "Invalid number of out arguments: expected " << num_outputs << " got " << out.size(); + if (in.size() != num_outputs + 1) + IE_THROW() << "Invalid number of in arguments: expected " << num_inputs + 1 << " got " << in.size(); + const auto io_size = num_inputs + num_outputs; + if (apply_increments.size() != io_size) + IE_THROW() << "Invalid apply_increments size: expected " << io_size << " got " << apply_increments.size(); + if (finalization_offsets.size() != io_size) + IE_THROW() << "Invalid finalization_offsets size: expected: " << io_size << " got " << finalization_offsets.size(); +} + +void LoopEndEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + std::vector data_ptr_reg_idxs(loop_begin->input_regs); + data_ptr_reg_idxs.reserve(num_inputs + num_outputs); + std::copy(out.begin(), out.end(), std::back_inserter(data_ptr_reg_idxs)); + std::vector data_ptr_regs; + transform_idxs_to_regs(data_ptr_reg_idxs, 
data_ptr_regs); + Reg64 reg_work_amount = Reg64(in.back()); + if (!evaluate_once) { + for (int idx = 0; idx < data_ptr_regs.size(); idx++) { + if (apply_increments[idx]) + h->add(data_ptr_regs[idx], increment * io_data_size[idx]); + } + h->sub(reg_work_amount, increment); + h->cmp(reg_work_amount, increment); + h->jge(loop_begin->begin_address); } -} -void TileEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - Reg64 work_amount = Reg64(static_cast(in[0])); - std::vector data_ptr_regs; - transform_idxs_to_regs(out, data_ptr_regs); - Label for_body; - // Note that: - // * Work amount must be set by TileScheduler that executes Tiles - // * TileScheduler executes Tile only if it has to perform >= 1 iterations - h->L(for_body); - emit_body(vec_pool, gpr_pool); - emit_ptr_increments(data_ptr_regs); - h->sub(work_amount, increment); - h->cmp(work_amount, increment); - h->jge(for_body, CodeGenerator::T_NEAR); + for (int idx = 0; idx < data_ptr_regs.size(); idx++) { + if (finalization_offsets[idx] != 0) + h->add(data_ptr_regs[idx], finalization_offsets[idx] * io_data_size[idx]); + } } BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index adfd88dfeddff6..7fa1b8f1aa958d 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -23,6 +23,7 @@ namespace intel_cpu { #define SNIPPETS_MAX_SNIPPETS_DIMS 12 #define SNIPPETS_MAX_HARNESS_DIMS 5 #define SNIPPETS_MAX_TILE_RANK 2 +#define SNIPPETS_DYNAMIC_MASTER_SHAPE_RANK 6 #define GET_OFF(field) offsetof(jit_snippets_call_args, field) struct jit_snippets_call_args { const void 
*src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; @@ -30,40 +31,40 @@ struct jit_snippets_call_args { }; struct jit_snippets_compile_args { - int64_t scheduler_dims[SNIPPETS_MAX_TILE_RANK] = {}; - int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; + std::vector master_shape{}; int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {}; - std::vector output_dims = {}; }; /// -/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (presently KernelEmitter, -/// TileSchedulerEmitter and TileEmitter). This is needed to provide common interface for register mapping +/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (for example, KernelEmitter) +/// This is needed to provide common interface for register mapping /// (abstract to physical) and nested code access. /// class jit_container_emitter: public jit_emitter { public: jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + // mapping info contains abstract_to_physical map + regs_pool + using mapping_info = std::pair, std::vector&>; protected: // maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools // (the first 2 args). All the used gpr and vec registers are also stored in the provided sets (the second 2 args). - void map_abstract_registers(const std::vector&, const std::vector&, - std::set&, std::set&); + void map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, + std::vector& allocated_emitters) const; std::vector body; }; /// /// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register -/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one) -/// TileSchedulerEmitter. 
In general the enclosed emitters should be organized in the following way: -/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ -/// TileSchedulerEmitter { /* executes required inner, avoids emitting code that won't be executed */ -/// TileEmitter { /* inner vector tile */ -/// ... /* All the necessary Load/Strore/elementwise emitters */ -/// } -/// TileEmitter { /* inner scalar tile for tail processing */ -/// ... /* All the necessary Load/Strore/elementwise emitters */ -/// } -/// } +/// mapping and creates pools of available gpr and vec registers. Kernel usually to contains (at least one) +/// LoopBeginEmitter and LoopEndEmitter pair. In general the enclosed emitters should be organized in the following way: +/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ +/// 1.S LoopBeginEmitter /* Scalar Loop over the outer dimension [START] */ +/// 2.S LoopBeginEmitter /* inner vector loop [START] */ +/// ... /* All the necessary Load/Strore/elementwise emitters */ +/// 2.E LoopEndEmitter /* inner vector loop [END] */ +/// 3.S LoopBeginEmitter /* inner scalar loop for tail processing [START]*/ +/// ... /* All the necessary Load/Strore/elementwise emitters */ +/// 3.E LoopEndEmitter /* inner scalar loop for tail processing [END]*/ +/// 1.E LoopEndEmitter /* Scalar Loop over the outer dimension [END] */ /// } /// Note that Kernel doesn't accept any input arguments. /// @@ -92,29 +93,22 @@ class KernelEmitter : public jit_container_emitter { jit_snippets_compile_args jcp; std::vector gp_regs_pool; - std::vector gp_regs_used; + // gpr's used to store data pointers, track them to apply offsets in Kernel + std::vector data_ptr_regs_idx; std::vector vec_regs_pool; + const size_t reg_indexes_idx = abi_param1.getIdx(); + const size_t reg_const_params_idx = abi_param2.getIdx(); }; -/// -/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). 
It calculates data offsets -/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector -/// tiles are emitted only if necessary; Tile body could be emitted directly, if only one Tile evaluation is required. -/// -/// \param in[0] The number of the node inputs -/// \param in[1] The number of the node outputs -/// \param in[2] The number of elements that fits into vector register -/// -class TileSchedulerEmitter : public jit_container_emitter { +class LoopBeginEmitter : public jit_emitter { public: - TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} + LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); void emit_code(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; + // todo: it is purely virtual in the base class, but do we need it? + size_t get_inputs_num() const override {return 0;} private: void validate_arguments(const std::vector &in, @@ -127,50 +121,48 @@ class TileSchedulerEmitter : public jit_container_emitter { const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void emit_tiles(const Reg64&, const std::vector&, size_t, const std::vector& , const std::vector&) const; - - jit_snippets_compile_args jcp; + std::shared_ptr loop_begin; + size_t num_inputs = 0; + bool evaluate_once = false; + size_t work_amount = 0; // need to store work_amount explicitly, since two loops can work on the same dim (e.g. vector + scalar) }; -/// -/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop: -/// it performs operations specified by enclosed emitters, advances iteration counters -/// and breaks when necessary. 
-/// -/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile. -/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. -class TileEmitter : public jit_container_emitter { +class LoopEndEmitter : public jit_emitter { public: - TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} - std::vector& get_nested_code(); + LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); void emit_code(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; - - void emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const; - void emit_ptr_increments(const std::vector& data_ptr_regs) const; + // todo: it is purely virtual in the base class, but do we need it? + size_t get_inputs_num() const override {return 0;} private: void validate_arguments(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; + void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; + std::shared_ptr loop_begin; + std::shared_ptr loop_end; + size_t num_inputs = 0; size_t num_outputs = 0; - std::vector io_dims {}; std::vector io_data_size {}; size_t increment = 0; + size_t work_amount = 0; + bool evaluate_once = false; + std::vector apply_increments; + std::vector finalization_offsets; }; + class NopEmitter : public jit_emitter { public: NopEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) @@ -205,7 +197,6 @@ class BroadcastMoveEmitter : public jit_emitter { void emit_isa(const std::vector &in, const std::vector &out) const; private: - bool 
use_broadcast; size_t byte_size = 0lu; }; @@ -239,7 +230,7 @@ class ScalarEmitter : public jit_emitter { /// it's illigal to load/store to the same address multiple times /// Typical application can be if Load and BroadcastLoad are performed from the same pointer. /// If Load goes before BroadcastLoad topologicaly the resilt will be incorrect -/// For scalar loads we can use different tiles. Tiling indeed can be arbitrary and post increment should be somehow coded into ISA. +/// For scalar loads we can use different loops. Tiling indeed can be arbitrary and post increment should be somehow coded into ISA. /// Blocked parameter to tell if input is actually blocked. Broadcast means broadcast by W in other cases no need to substitute load. class MemoryEmitter : public jit_emitter { public: @@ -354,6 +345,5 @@ class StoreConvertEmitter : public MemoryEmitter { size_t count; std::unique_ptr store_emitter = nullptr; }; - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp index 5e4fda63190a5a..33d0e2e61fec60 100644 --- a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp @@ -81,7 +81,7 @@ bool canBePerformedAsScaleShift(const std::shared_ptr &node, const i fusingPort = i; dataShape = node->get_input_partial_shape(i); // only one non-const parent is allowed - if (dataShape.is_dynamic() || ++numNonConstInputs != 1) + if (++numNonConstInputs != 1) return false; } else { // every const parent must have exactly one child @@ -97,8 +97,7 @@ bool canBePerformedAsScaleShift(const std::shared_ptr &node, const i if (i == fusingPort) continue; const ov::PartialShape weightShape = node->get_input_partial_shape(i); - if (weightShape.is_dynamic() || - !isPerTensorOrPerChannelBroadcastable(dataShape.get_shape(), 
weightShape.get_shape(), channelAxis, true)) + if (!isPerTensorOrPerChannelBroadcastable(dataShape.get_max_shape(), weightShape.get_max_shape(), channelAxis, true)) return false; } return true; @@ -250,22 +249,20 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, con NodeFusingType &updatedChainType, int& fusingAxis) { int num_non_const_inputs = 0; bool can_be_converted_to_FC = false; - ov::Shape bias_shape; - ov::Shape matmul_shape; + ov::PartialShape bias_shape; + ov::PartialShape matmul_shape; for (const auto &parent_out : node->input_values()) { const auto parent = parent_out.get_node_shared_ptr(); if (ngraph::op::is_constant(parent)) { bias_shape = parent_out.get_shape(); num_non_const_inputs++; } else { - const auto pshape = parent_out.get_partial_shape(); - if (pshape.is_dynamic() || pshape.get_shape().empty()) + matmul_shape = parent_out.get_partial_shape(); + if (matmul_shape.size() == 0) return false; - matmul_shape = pshape.get_shape(); const auto& grandparents = parent->input_values(); // first check that weights are constant and both activations and weights have static shape if (grandparents.size() == 2 && - grandparents[0].get_partial_shape().is_static() && grandparents[1].get_partial_shape().is_static() && ov::is_type(grandparents[1].get_node_shared_ptr())) { auto rank_a = grandparents[0].get_partial_shape().rank().get_length(); @@ -280,8 +277,9 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, con // Matmul / FC bias fusion if (ov::is_type(node) && - bias_shape.back() == matmul_shape.back() && - bias_shape.back() == shape_size(bias_shape)) { + bias_shape.rbegin()->get_length() == matmul_shape.rbegin()->get_length() && + bias_shape.is_static() && + bias_shape.rbegin()->get_length() == shape_size(bias_shape.get_shape())) { return true; } @@ -431,7 +429,7 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { RUN_ON_MODEL_SCOPE(SnippetsMarkSkipped); int channelAxis = DEFAULT_AXIS; for (auto &node : 
m->get_ordered_ops()) { - if (ngraph::op::is_constant(node)) + if (ngraph::op::is_constant(node) || ov::is_type(node)) continue; if (ngraph::op::is_parameter(node)) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 503d989492f5e3..7ad5ebf1636d1f 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -22,6 +22,7 @@ #include #include "emitters/cpu_generator.hpp" +#include "utils/cpu_utils.hpp" #include "snippets_transformations/fuse_load_store_and_convert.hpp" #include "ngraph_transformations/convert_to_swish_cpu.hpp" @@ -67,6 +68,7 @@ void Snippet::copy_snippet() { ngraph::copy_runtime_info(original_snippet, snippet); snippet->set_friendly_name(original_snippet->get_friendly_name()); snippet->set_generator(std::make_shared(host_isa)); + isa_num_lanes = snippet->get_generator()->get_target_machine()->get_lanes(); } void Snippet::initSupportedPrimitiveDescriptors() { @@ -89,7 +91,13 @@ void Snippet::initSupportedPrimitiveDescriptors() { // Todo: Snippets currently don't support per-channel broadcasting of Blocked descriptors because // canonicalization can't distinguish between and cases. // See snippets::op::Subgraph::canonicalize for details. 
- const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual; + bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual; + + for (const auto& inShape : inputShapes) { + if (isDynamic && inShape.getRank() != 1) + isBlockedApplicable = isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1; + } + enum LayoutType { Planar, ChannelsFirst, @@ -192,42 +200,6 @@ void Snippet::initSupportedPrimitiveDescriptors() { void Snippet::selectOptimalPrimitiveDescriptor() { selectPreferPrimitiveDescriptor(getPrimitivesPriority(), true); } - -void Snippet::createPrimitive() { - // schedule definition part - // it defines offsets, strides and sizes for snippet kernel scheduling - define_schedule(); - - // code generation part - // it might be worth to generate explicitly for scheduler work amount for now, - // but in future some interface should be defined in order to communicate schedule for a kernel - // or generate schedule for a kernel. - // Here kernel is generated for most warying dimension by default. 
- generate(); -} - -void Snippet::execute(dnnl::stream strm) { - if (schedule.ptr == nullptr || !canUseOptimizedImpl) { - IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; - } - jit_snippets_call_args call_args; - for (size_t i = 0; i < srcMemPtrs.size(); i++) - call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; - - for (size_t i = 0; i < dstMemPtrs.size(); i++) - call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; - - if (tensorRank == rank6D) { - schedule_6d(call_args); - } else { - schedule_nt(call_args); - } -} - -bool Snippet::created() const { - return getType() == Type::Subgraph; -} - InferenceEngine::Precision Snippet::getRuntimePrecision() const { std::vector inputPrecisions; for (size_t i = 0; i < getParentEdges().size(); i++) { @@ -240,264 +212,343 @@ InferenceEngine::Precision Snippet::getRuntimePrecision() const { return getMaxPrecision(inputPrecisions); } -bool Snippet::canBeInPlace() const { - if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { - return false; - } - - if (getChildEdges().size() != 1) { - return false; - } - - for (auto& parentEdge : getParentEdges()) { - auto parent = parentEdge.lock()->getParent(); - if (parent->getChildEdges().size() != 1) - return false; - - // WA to prevent memory corruption caused by inplace feature - if (parent->getType() == Type::Concatenation) { - for (auto& parentParentEdge : parent->getParentEdges()) { - auto parentParent = parentParentEdge.lock()->getParent(); - if (parentParent->getChildEdges().size() != 1) - return false; - } +void Snippet::calcJITParams(std::vector& offsets) const { + const size_t numInputs = normInputShapes.size(); + const size_t numParams = numInputs + normOutputShapes.size(); + + // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter + const size_t offset_rank = masterShape.size() - 1; + 
offsets.resize(numParams * (offset_rank), 1); + auto offset_calculation = [offset_rank, this](int64_t *off, const std::vector& dims, const size_t data_size) { + size_t k = dims.back(); + for (int i = offset_rank - 1; i >= 0; i--) { + auto tmp = (dims[i] == masterShape[i] && masterShape[i] != 1) ? k : 0; + off[i] = tmp * data_size; + k *= dims[i]; } - } - return getInputShapeAtPort(0) == getOutputShapeAtPort(0); -} - -static void offset_calculation(std::vector& offset, const std::vector& dims_in, const std::vector& dims_out) { - size_t k = 1; - for (int i = offset.size() - 1; i >= 0; i--) { - offset[i] = (dims_in[i] == dims_out[i]) ? k : 0; - k *= dims_in[i]; - } -} - -static auto collapseLastDims(std::vector& dims, size_t dimsToCollapse) -> void { - if (dimsToCollapse >= dims.size() - 1) - IE_THROW() << "Got invalid number of dims to collapse. Expected < " << dims.size() - 1 << " got " << dimsToCollapse; - for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { - dims[dims.size() - 1] *= dims[i]; - } - - for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { - dims[i] = dims[i - dimsToCollapse]; - } - - for (int i = dimsToCollapse - 1; i >= 0; i--) { - dims[i] = 1; - } -} - -void Snippet::define_schedule() { - auto edgeToBlockedShape = [](const EdgePtr& edge) { - const auto blockedDesc = edge->getMemory().GetDescWithType(); - ngraph::Shape shape(blockedDesc->getBlockDims()); - ngraph::AxisVector blocking(blockedDesc->getOrder()); - ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); - return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; - }; - auto prependWithOnes = [this](const std::vector& dims) { - if (tensorRank <= dims.size()) - return dims; - VectorDims result(tensorRank, 1); - std::copy(dims.begin(), dims.end(), &result[tensorRank - dims.size()]); - return result; }; - ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes; - for (size_t 
i = 0; i < inputShapes.size(); i++) - input_blocked_shapes.push_back(edgeToBlockedShape(getParentEdgesAtPort(i)[0])); - - ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; - for (size_t i = 0; i < outputShapes.size(); i++) - output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0])); - - exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); - - // initialize by maximum output dimension. Dimensions of outputs should be broadcastable - tensorRank = std::max(static_cast(rank6D), exec_domain.size()); - // Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank - // prepend to enable 6D scheduler - exec_domain = prependWithOnes(exec_domain); - const auto &body = snippet->get_body(); - for (const auto& p : body->get_parameters()) { - dims_in.emplace_back(prependWithOnes(p->get_shape())); - } - - for (size_t i = 0; i < body->get_output_size(); i++) { - dims_out.push_back(prependWithOnes(body->get_output_shape(i))); + for (size_t i = 0; i < numParams; i++) { + offset_calculation(&offsets[i * offset_rank], + i < numInputs ? 
normInputShapes[i] : normOutputShapes[i - numInputs], + dataSize[i]); } - - const auto config = getSelectedPrimitiveDescriptor()->getConfig(); - auto initOffsets = [this, config]() { - // find max rank input among all outputs - const size_t inputNum = getParentEdges().size(); - offsets_in.resize(inputNum); - for (size_t i = 0; i < inputNum; i++) { - offsets_in[i].resize(tensorRank, 1); - offset_calculation(offsets_in[i], dims_in[i], exec_domain); - for (size_t j = 0; j < tensorRank; j++) { - offsets_in[i][j] *= config.inConfs[i].getMemDesc()->getPrecision().size(); +} +void Snippet::optimizeExecDomain(std::vector& inputShapes, std::vector& outputShapes, + VectorDims &domain, size_t& TileRank) const { + const size_t minimalConcurrency = parallel_get_max_threads(); + const size_t minimalJitWorkAmount = 256; + const size_t ds = domain.size(); + if ( ds <= 2 || // not enough dimensions to collapse + domain[ds-1] >= minimalJitWorkAmount || // There is enough work for 1D Tiles, no need to collapse + domain[ds-1] * domain[ds-2] >= fullWorkAmount / minimalConcurrency) // There won't be enough work for every thread (even one iter) if we collapse + return; + auto findDimsToCollapse = [&]() { + auto collapseLastDims = [](VectorDims& dims, size_t dimsToCollapse) { + if (dimsToCollapse >= dims.size() - 1) + IE_THROW() << "Got invalid number of dims to collapse. 
Expected < " << dims.size() - 1 << " got " << dimsToCollapse; + for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { + dims[dims.size() - 1] *= dims[i]; } - } - start_offset_in.resize(inputNum); - srcMemPtrs.resize(inputNum); - for (size_t i = 0; i < inputNum; i++) { - const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); - srcMemPtrs[i] = memPtr; - start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * - config.inConfs[i].getMemDesc()->getPrecision().size(); - } - - const size_t outputNum = config.outConfs.size(); - offsets_out.resize(outputNum); - for (size_t i = 0; i < outputNum; i++) { - offsets_out[i].resize(tensorRank, 1); - offset_calculation(offsets_out[i], dims_out[i], exec_domain); - for (size_t j = 0; j < tensorRank; j++) { - offsets_out[i][j] *= config.outConfs[i].getMemDesc()->getPrecision().size(); + for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { + dims[i] = dims[i - dimsToCollapse]; } - } - start_offset_out.resize(outputNum); - dstMemPtrs.resize(outputNum); - for (size_t i = 0; i < outputNum; i++) { - const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); - dstMemPtrs[i] = memPtr; - start_offset_out[i] = memPtr->GetDescWithType()->getOffsetPadding() * - config.outConfs[i].getMemDesc()->getPrecision().size(); - } - }; - - auto find_dims_to_collapse = [this, config]() -> int { + for (int i = dimsToCollapse - 1; i >= 0; i--) { + dims[i] = 1; + } + }; int collapsedDims = 0; - size_t minimalConcurrency = parallel_get_max_threads(); - size_t minimalJitWorkAmount = 256; - size_t currentJitWorkAmount = exec_domain.back(); + size_t currentJitWorkAmount = domain[domain.size() - 1]; while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) { - if (static_cast(exec_domain.size()) - collapsedDims - 2 < 0) + if (static_cast(domain.size()) - collapsedDims - 2 < 0) break; bool canCollapse = true; - for (size_t i = 0; i < dims_in.size(); i++) { - if ((dims_in[i][dims_in[i].size() - 2] 
!= 1 && dims_in[i][dims_in[i].size() - 1] == 1) || - (dims_in[i][dims_in[i].size() - 2] == 1 && dims_in[i][dims_in[i].size() - 1] != 1)) { + for (size_t i = 0; i < inputShapes.size(); i++) { + const size_t last = inputShapes[i].size() - 1; + if ((inputShapes[i][last - 1] != 1 && inputShapes[i][last] == 1) || + (inputShapes[i][last - 1] == 1 && inputShapes[i][last] != 1)) { canCollapse = false; break; } } - size_t nextJitWorkAmount = currentJitWorkAmount * exec_domain[exec_domain.size() - 2]; + size_t nextJitWorkAmount = currentJitWorkAmount * domain[domain.size() - 2]; if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { currentJitWorkAmount = nextJitWorkAmount; // if we cannot use dim collapsing we should use tile2D if (!canCollapse) { - if (tileRank < maxTileRank) { - tileRank++; + if (TileRank < maxTileRank) { + TileRank++; continue; } break; } - collapsedDims++; - for (auto &d : dims_in) + for (auto &d : inputShapes) collapseLastDims(d, 1); - - for (auto &d : dims_out) + for (auto &d : outputShapes) collapseLastDims(d, 1); - - collapseLastDims(exec_domain, 1); + collapseLastDims(domain, 1); } else { break; } } - return collapsedDims; + return domain; + }; + findDimsToCollapse(); +} +ov::PartialShape Snippet::canonicalizeBody() { + auto edgeToBlockedShape = [](const EdgePtr& edge) { + const auto blockedDesc = edge->getMemory().GetDescWithType(); + std::vector dims; + // if blockDim == Shape::UNDEFINED_DIM, then it's a dynamic dimension, and we need to recreate a proper dynamic Dim + for (const auto& d : blockedDesc->getBlockDims()) + dims.emplace_back(d == Shape::UNDEFINED_DIM ? 
-1 : d); + ngraph::PartialShape shape(dims); + ngraph::AxisVector blocking(blockedDesc->getOrder()); + ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); + return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; }; + inputShapeIsBlocked.resize(inputShapes.size(), false); + masterShapeIsBlocked = false; + ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes; + for (size_t i = 0; i < inputShapes.size(); i++) { + auto blockedShape = edgeToBlockedShape(getParentEdgesAtPort(i)[0]); + inputShapeIsBlocked[i] = std::get<0>(blockedShape).size() != std::get<1>(blockedShape).size(); + masterShapeIsBlocked = masterShapeIsBlocked || inputShapeIsBlocked[i]; + input_blocked_shapes.push_back(blockedShape); + } - auto initSchedulingInfo = [this, config]() -> void { - // initialize scheduling information - sch_offsets_in.resize(offsets_in.size(), 0); - sch_offsets_out.resize(offsets_out.size(), 0); - sch_dims.resize(maxTileRank, 1); - sch_dims[maxTileRank-1] = exec_domain.back(); - schedulerWorkAmount = fullWorkAmount / exec_domain.back(); - if (tileRank > 1) { - sch_dims[maxTileRank - tileRank] = exec_domain[tensorRank - 2]; - schedulerWorkAmount /= exec_domain[tensorRank - 2]; - exec_domain[tensorRank - 2] = 1; - - // update offsets for tile 2D because loaders and stores have ptr shifts in some cases - const int64_t vector_size = snippet->get_generator()->get_target_machine()->get_lanes(); - for (size_t i = 0; i < offsets_in.size(); i++) { - const int64_t offset = offsets_in[i][tensorRank - 2]; - const int64_t data_size = config.inConfs[i].getMemDesc()->getPrecision().size(); - if (offset == data_size || offset == vector_size * data_size) { - sch_offsets_in[i] = offset; - } else if ((offset > data_size) || (offset == 0 && dims_in[i].back() != 1 && dims_in[i].back() != vector_size)) { - sch_offsets_in[i] = offset - exec_domain.back() * data_size; - - // If scalar tile executes one 
time, ptr doesn't move on 1 value - // so we should absolutelly decrease offset - if (exec_domain.back() % vector_size == 1) { - sch_offsets_in[i] += data_size; - } - } - } + outputShapeIsBlocked.resize(outputShapes.size(), false); + ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; + for (size_t i = 0; i < outputShapes.size(); i++) { + auto blockedShape = edgeToBlockedShape(getChildEdgesAtPort(i)[0]); + outputShapeIsBlocked[i] = std::get<0>(blockedShape).size() != std::get<1>(blockedShape).size(); + output_blocked_shapes.push_back(blockedShape); + } - for (size_t i = 0; i < offsets_out.size(); i++) { - const int64_t offset = offsets_out[i][tensorRank - 2]; - const size_t data_size = config.outConfs[i].getMemDesc()->getPrecision().size(); - if (offset == data_size || offset == vector_size * data_size) { - sch_offsets_out[i] = offset; - } else if ((offset > data_size) || (offset == 0 && dims_out[i].back() != 1 && dims_out[i].back() != vector_size)) { - sch_offsets_out[i] = offset - exec_domain.back() * data_size; - - // If scalar tile executes one time, ptr doesn't move on 1 value - // so we should absolutelly decrease offset - if (exec_domain.back() % vector_size == 1) { - sch_offsets_out[i] += data_size; - } + const auto canonicalShape = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); + return canonicalShape; +} +void Snippet::createPrimitive() { + // determine canonicalize, determine master_shape and prepend up to 6D + // NB! normInputShapes are updated, so body reshape might be needed + const auto& canonicalShape = canonicalizeBody(); + // initialize by maximum output dimension. 
Dimensions of outputs should be broadcastable + tensorRank = std::max(static_cast(rank6D), canonicalShape.size()); + + const auto config = getSelectedPrimitiveDescriptor()->getConfig(); + auto initDataSizes = [this, config]() { + const size_t numInputs = inputShapes.size(); + const size_t numOutputs = outputShapes.size(); + dataSize.resize(numInputs + numOutputs); + for (size_t i = 0; i < numInputs; i++) + dataSize[i] = config.inConfs[i].getMemDesc()->getPrecision().size(); + for (size_t i = 0; i < numOutputs; i++) + dataSize[i + numInputs] = config.outConfs[i].getMemDesc()->getPrecision().size(); + }; + initDataSizes(); + + jit_snippets_compile_args jcp; + if (canonicalShape.is_dynamic()) + IE_THROW() << "Snippets: Canonicalization returned dynamic shape in static pipeline"; + masterShape = canonicalShape.get_shape(); + const auto &body = snippet->get_body(); + for (const auto& p : body->get_parameters()) + normInputShapes.emplace_back(p->get_output_shape(0)); + for (const auto& r : body->get_results()) + normOutputShapes.emplace_back(r->get_input_shape(0)); + + prepareParams(); + jcp.master_shape = masterShape; + std::copy(data_offsets.begin(), data_offsets.end(), jcp.data_offsets); + generate(&jcp); +} + +std::vector Snippet::shapeInfer() const { + // todo: it's very strange that we don't have broadcast_merge_into for cpu shapes + auto broadcast_merge = [](VectorDims& dst, const VectorDims& src){ + // Ranks are both static. + auto dst_rank = dst.size(); + auto src_rank = src.size(); + const auto new_rank = std::max(dst_rank, src_rank); + dst.insert(dst.begin(), new_rank - dst_rank, 1); + std::vector dims(new_rank); + bool success = true; + for (int64_t i = 0; i < new_rank; i++) { + auto dsti = i < (new_rank - dst_rank) ? 1 : dst[i - (new_rank - dst_rank)]; + auto srci = i < (new_rank - src_rank) ? 
1 : src[i - (new_rank - src_rank)]; + if (dsti != srci && srci != Shape::UNDEFINED_DIM) { + if (dsti == 1 || dsti == Shape::UNDEFINED_DIM) { + dsti = srci; + } else { + success = false; } } } + return success; }; + for (size_t i = 0; i < getParentEdges().size(); i++) { + VectorDims inDims {getParentEdgesAtPort(i)[0]->getMemory().GetShape().getDims()}; + if (masterShapeIsBlocked && !inputShapeIsBlocked[i]) + inDims.insert(inDims.end(), 1); + // todo: this is a simple master_shape inference for shape-agnostic operations, + // we'll need to account for body operations semantics in the future + if (i == 0) + masterShape = inDims; + else + broadcast_merge(masterShape, inDims); + normInputShapes[i] = std::move(inDims); + } + if (std::any_of(masterShape.begin(), masterShape.end(), [](const Dim& d){ return d == Shape::UNDEFINED_DIM;})) { + std::ostringstream errorMessage; + errorMessage << "Can't compute static master shape for Snippet node with name: " << getName(); + errorMessage << ". Input shapes = ( "; + for (size_t i = 0; i < getParentEdges().size(); i++) { + errorMessage << i << " port = " << getParentEdgesAtPort(i)[0]->getMemory().GetShape().toString() << ", "; + } + errorMessage << "). 
Master shape = ( " << masterShape << " )"; + IE_THROW() << errorMessage.str(); + } - fullWorkAmount = 1; - for (const auto &d : exec_domain) { - fullWorkAmount *= d; + if (normOutputShapes.size() == 1) { + normOutputShapes[0] = masterShape; + return {masterShape}; } + std::vector outputDims; + std::vector new_shapes; + for (const auto& s : normInputShapes) + new_shapes.emplace_back(s); + const auto& outputShapes = snippet->reshape_body(new_shapes); + for (size_t i = 0; i < outputShapes.size(); i++) + normOutputShapes[i] = outputShapes[i]; + return normOutputShapes; +} - batchDimIdx = tensorRank - exec_domain.size(); - // Note that exec_domain can be modified inside find_dims_to_collapse() and/or initSchedulingInfo() - find_dims_to_collapse(); +void Snippet::prepareParams() { + // here must be all the stuff that could only be done for static shapes, e.g. offset calculation + // Here it must be all the stuff that could be done once for both static and dynamic shapes + + masterShape = getNormalizedDimsBySize(masterShape, tensorRank); + for (auto& pshape : normInputShapes) + pshape = getNormalizedDimsBySize(pshape, tensorRank); + for (auto& pshape : normOutputShapes) + pshape = getNormalizedDimsBySize(pshape, tensorRank); + + tileRank = 1; + fullWorkAmount = std::accumulate(masterShape.begin(), masterShape.end(), 1, std::multiplies()); + // optimizeExecDomain will collapse shape dimensions and adjust tile Rank + optimizeExecDomain(normInputShapes, normOutputShapes, masterShape, tileRank); + exec_domain = masterShape; + + // todo: probably better to pass a call_args instance + calcJITParams(data_offsets); + auto initStartMemoryOffsets = [this]() { + const auto config = getSelectedPrimitiveDescriptor()->getConfig(); + const size_t numInputs = inputShapes.size(); + start_offset_in.resize(numInputs); + srcMemPtrs.resize(numInputs); + for (size_t i = 0; i < numInputs; i++) { + const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); + srcMemPtrs[i] = memPtr; + 
start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize[i]; + } + const size_t numOutputs = outputShapes.size(); + start_offset_out.resize(numOutputs); + dstMemPtrs.resize(numOutputs); + for (size_t i = 0; i < numOutputs; i++) { + const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); + dstMemPtrs[i] = memPtr; + start_offset_out[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize[i + numInputs]; + } + }; + // initialize start offsets to src and dst memory + // Needs to be done for every set of input shapes sce memory ptrs could've updated + initStartMemoryOffsets(); + std::vector scheduler_work_amounts; + // rename schedulerWorkAmount to harnessWorkAmount? + harnessWorkAmount = fullWorkAmount; + const auto rank = exec_domain.size(); + for (auto i = rank - tileRank; i < rank; i++) { + auto& dim = exec_domain[i]; + harnessWorkAmount /= dim; + scheduler_work_amounts.push_back(dim); + dim = 1; + } - initOffsets(); - initSchedulingInfo(); + std::vector new_shapes; + for (const auto& s : normInputShapes) { + ov::Shape ns(tileRank, 0); + const int offset = s.size() - tileRank; + // todo: this check is excessive, remove it before merge + if (offset < 0) + IE_THROW() << "Error during creating reduced body shapes: tileRank is larger than the input size"; + std::copy(s.begin() + offset, s.end(), ns.begin()); + new_shapes.emplace_back(std::move(ns)); + } + snippet->set_master_shape(PartialShape(scheduler_work_amounts)); + snippet->reshape_body(new_shapes); } -void Snippet::generate() { - jit_snippets_compile_args jcp; - jcp.output_dims = exec_domain; - std::copy(sch_dims.begin(), sch_dims.end(), jcp.scheduler_dims); - std::copy(sch_offsets_in.begin(), sch_offsets_in.end(), jcp.scheduler_offsets); - std::copy(sch_offsets_out.begin(), sch_offsets_out.end(), &jcp.scheduler_offsets[sch_offsets_in.size()]); - size_t harness_num_dims = jcp.output_dims.size() - 1; - if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS) { - canUseOptimizedImpl = false; - 
harness_num_dims = SNIPPETS_MAX_HARNESS_DIMS; +bool Snippet::needPrepareParams() const { + return inputShapesModified() || !schedule.ptr; +} + +void Snippet::updateSrcDstPtrs(jit_snippets_call_args& call_args) const { + for (size_t i = 0; i < srcMemPtrs.size(); i++) + call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; + + for (size_t i = 0; i < dstMemPtrs.size(); i++) + call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; +} + +void Snippet::execute(dnnl::stream strm) { + if (schedule.ptr == nullptr) { + IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; } - for (size_t i = 0; i < inputShapes.size(); i++) { - auto b = offsets_in[i].begin(); - std::copy(b, b + harness_num_dims, &jcp.data_offsets[i * harness_num_dims]); + jit_snippets_call_args call_args; + updateSrcDstPtrs(call_args); + + if (tensorRank == rank6D) { + schedule_6d(call_args); + } else { + schedule_nt(call_args); } - for (size_t i = 0; i < outputShapes.size(); i++) { - auto b = offsets_out[i].begin(); - std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]); +} + +bool Snippet::canBeInPlace() const { + if (isDynamic || getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { + return false; } + if (getChildEdges().size() != 1) { + return false; + } + + for (auto& parentEdge : getParentEdges()) { + auto parent = parentEdge.lock()->getParent(); + if (parent->getChildEdges().size() != 1) + return false; + // WA to prevent memory corruption caused by inplace feature + if (parent->getType() == Type::Concatenation) { + for (auto& parentParentEdge : parent->getParentEdges()) { + auto parentParent = parentParentEdge.lock()->getParent(); + if (parentParent->getChildEdges().size() != 1) + return false; + } + } + } + return getInputShapeAtPort(0) == getOutputShapeAtPort(0); +} + +bool Snippet::created() const { + return getType() == 
Type::Subgraph; +} + +void Snippet::generate(const jit_snippets_compile_args* jcp) { ov::pass::Manager optManager; optManager.register_pass(); optManager.register_pass(); @@ -518,8 +569,7 @@ void Snippet::generate() { return convert->get_input_element_type(0) != ov::element::f32; return true; }); - - schedule = snippet->generate(optManager, reinterpret_cast(&jcp)); + schedule = snippet->generate(optManager, reinterpret_cast(jcp)); } void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { @@ -536,7 +586,7 @@ void Snippet::schedule_nt(const jit_snippets_call_args& call_args) const { const auto& work_size = exec_domain; parallel_nt(0, [&](const int ithr, const int nthr) { size_t start = 0, end = 0; - splitter(schedulerWorkAmount, nthr, ithr, start, end); + splitter(harnessWorkAmount, nthr, ithr, start, end); std::vector indexes(work_size.size() - 1, 0); for (size_t iwork = start; iwork < end; ++iwork) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9c302555fb6823..0dc2354f02cd53 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -38,6 +38,9 @@ class Snippet : public Node { // Here we convert to canonical for & jit everything void createPrimitive() override; + void prepareParams() override; + std::vector shapeInfer() const override; + bool needPrepareParams() const override; bool canBeInPlace() const override; bool created() const override; @@ -55,10 +58,12 @@ class Snippet : public Node { // NOTE: Before call mutex should be initialized void copy_snippet(); - void define_schedule(); - - void generate(); + ov::PartialShape canonicalizeBody(); + void optimizeExecDomain(std::vector&, std::vector&, VectorDims&, size_t&) const; + void calcJITParams(std::vector& offsets) const; + void generate(const jit_snippets_compile_args*); + void updateSrcDstPtrs(jit_snippets_call_args&) const; // Evaluates generated snippet using parallel backend 
void schedule_6d(const jit_snippets_call_args& const_args) const; void schedule_nt(const jit_snippets_call_args& const_args) const; @@ -73,34 +78,36 @@ class Snippet : public Node { // Holds ISA version used is codeGeneration target dnnl::impl::cpu::x64::cpu_isa_t host_isa; + size_t isa_num_lanes; // number of elements that fit in vector size // Holds index of output used as in execution domain // it should be compatible with a schedule's work size std::vector exec_domain = {}; /// scheduling info - size_t batchDimIdx = 0; size_t tensorRank = 0; size_t tileRank = 1; size_t fullWorkAmount = 0; - size_t schedulerWorkAmount = 0; + size_t harnessWorkAmount = 0; const size_t maxTileRank = 2; std::vector srcMemPtrs = {}; std::vector dstMemPtrs = {}; + std::vector dataSize = {}; - std::vector> dims_in = {}; - std::vector> offsets_in = {}; - std::vector start_offset_in = {}; - std::vector start_offset_out = {}; + std::vector data_offsets; + // this is needed for fast shape inference of blocking-invariant prepended shapes + std::vector inputShapeIsBlocked = {}; // we need this info to shape-infer mixed layouts + std::vector outputShapeIsBlocked = {}; // we need this info to shape-infer mixed layouts + bool masterShapeIsBlocked = false; - std::vector> dims_out = {}; - std::vector> offsets_out = {}; + // master shape is mutable since we need to modify it inside const shapeInfer method + mutable VectorDims masterShape = {}; + mutable std::vector normInputShapes = {}; + mutable std::vector normOutputShapes = {}; - std::vector sch_dims = {}; - std::vector sch_offsets_in = {}; - std::vector sch_offsets_out = {}; - bool canUseOptimizedImpl = true; + std::vector start_offset_in = {}; + std::vector start_offset_out = {}; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index d9252bf2ecbe21..806dede4417a89 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -650,7 +650,6 @@ 
static void TransformationUpToCPUSpecificOpSet(std::shared_ptr if (n->inputs().size() > 1 && !ov::is_type(n->get_input_node_shared_ptr(1))) return true; } - const auto& inputs = n->inputs(); // todo: clarify whether we can evaluate snippets on const paths const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp index ebc3685c80a3c3..9c5c2a9904a48d 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp @@ -22,15 +22,40 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, ::testing::Values(CommonTestUtils::DEVICE_CPU)), Add::getTestCaseName); + +namespace snippets_static_1 { +// These inputs are needed to test static Loop optimizations (emit the whole tile, body with increments, set WA etc) +std::vector inShapesStatic1{{1, 16, 29, 1}, {1, 16, 29, 7}, {1, 16, 29, 8}, {1, 16, 29, 15}, {1, 16, 29, 16}, {1, 16, 29, 31}}; +std::vector inShapesStatic2{{1, 16, 29, 1}, {1, 16, 1, 1}, {1, 1, 1, 1}}; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 42, 16, 64}), - ::testing::Values(ov::Shape {1, 42, 16, 1}), - ::testing::Values(ov::element::f32), - ::testing::Values(3), // Add + 2 sinh after inputs - ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic1), + ::testing::ValuesIn(inShapesStatic2), + ::testing::Values(ov::element::f32), + ::testing::Values(3), // Add + 2 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), AddSinh::getTestCaseName); +// 
test cross-tile (vector vs scalar) optimizations in the absence of vector tile +std::vector> inShapesStatic{ + {{1, 128, 1, 1}, {1, 128, 1, 1}}, + {{1, 128, 1, 9}, {1, 128, 1, 9}}, + {{1, 128, 1, 17}, {1, 128, 1, 17}}, + {{1, 128, 1, 29}, {1, 128, 1, 29}}, + {{1, 128, 1, 33}, {1, 128, 1, 33}}, + {{1, 128, 9, 30}, {1, 128, 1, 30}}, + {{1, 128, 9, 1}, {1, 128, 1, 30}}, +}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhPair, + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic), + ::testing::Values(ov::element::f32), + ::testing::Values(3), // Add + 2 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSinhPair::getTestCaseName); + +} // namespace snippets_static_1 INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhConst, ::testing::Combine( diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp index 323e069ebc0a5b..4b32e9ded8657b 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp @@ -30,10 +30,10 @@ const std::vector, std::vector> inputShapes_Convert = { - { ov::Shape{2, 16} }, - { ov::Shape{5, 5} }, - { ov::Shape{2, 12, 1} } +const std::vector> inputShapes_Convert = { + { ov::PartialShape{2, 16} }, + { ov::PartialShape{5, 5} }, + { ov::PartialShape{2, 12, 1} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert, @@ -57,10 +57,10 @@ const std::vector, std::vector> inputShapes_ConvertInput = { - { ov::Shape{2, 16}, ov::Shape{1, 16} }, - { ov::Shape{5, 18}, ov::Shape{5, 1} }, - { ov::Shape{3, 1}, ov::Shape{3, 21} } +const std::vector> inputShapes_ConvertInput = { + { ov::PartialShape{2, 16}, ov::PartialShape{1, 16} }, + { ov::PartialShape{5, 18}, ov::PartialShape{5, 1} 
}, + { ov::PartialShape{3, 1}, ov::PartialShape{3, 21} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertInput, ConvertInput, @@ -94,10 +94,10 @@ const std::vector, std::vector> inputShapes_ConvertPartialInputsAndResults = { - { ov::Shape{2, 16}, ov::Shape{1, 16}, ov::Shape{1, 1} }, - { ov::Shape{5, 18}, ov::Shape{5, 1}, ov::Shape{1, 18} }, - { ov::Shape{3, 1}, ov::Shape{3, 21}, ov::Shape{3, 1} } +const std::vector> inputShapes_ConvertPartialInputsAndResults = { + { ov::PartialShape{2, 16}, ov::PartialShape{1, 16}, ov::PartialShape{1, 1} }, + { ov::PartialShape{5, 18}, ov::PartialShape{5, 1}, ov::PartialShape{1, 18} }, + { ov::PartialShape{3, 1}, ov::PartialShape{3, 21}, ov::PartialShape{3, 1} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertPartialInputsAndResults, @@ -117,7 +117,7 @@ const std::vector, std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertMany), ::testing::Values(2), ::testing::Values(1), @@ -126,7 +126,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutputs, ::testing::Combine( - ::testing::Values(std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertMany), ::testing::Values(2), // sinh + subgraph ::testing::Values(1), @@ -140,7 +140,7 @@ const std::vector, std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertManyIO), ::testing::Values(2), // sinh + subgraph ::testing::Values(1), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp index fa182cf548a937..90392416e4591c 100644 --- 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp @@ -10,7 +10,7 @@ namespace test { namespace snippets { namespace { -const std::vector> input_shapes = { +const std::vector> input_shapes = { { {5, 5, 256, 1}, {5, 5, 256, 1} }, { {5, 5, 16, 35}, {5, 5, 16, 35} }, { {5, 5, 256, 1}, {5, 5, 256, 35} }, @@ -26,7 +26,6 @@ const std::vector> input_shapes = { { {5, 5, 35, 17}, {5, 5, 35, 17} }, { {5, 5, 35, 17}, {5, 5, 1, 17} }, - { {5, 5, 35, 18}, {5, 5, 35, 18} }, { {5, 5, 35, 18}, {5, 5, 1, 18} }, }; diff --git a/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp b/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp index c02eb1a2a45de8..9aab3ffdfe7a01 100644 --- a/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp +++ b/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp @@ -31,7 +31,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipAfterInputsEltwise) { } TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipAfterInputsMatMulEltwise) { - const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); + const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); function = f.getOriginal(); // Fully tokenizable, since inputs are followed by MatMul function_ref = f.getReference(); @@ -42,7 +42,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipConvFused_ConvMulActivation) std::vector> eltwiseOps {std::make_shared(), std::make_shared(), std::make_shared()}; - std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; + std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; const auto &f = ConvMulActivationFunction(inputShapes, eltwiseOps); function = f.getOriginal(); // Fully tokenizable, since Mul with 2 inputs isn't fused into 
Convolution @@ -54,7 +54,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_SkipConvFused_ConvSumActivation) { std::vector> eltwiseOps {std::make_shared(), std::make_shared(), std::make_shared()}; - std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; + std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; const auto &f = ConvMulActivationFunction(inputShapes, eltwiseOps); function = f.getOriginal(); // Not tokenizable, since Add + Eltwises can be fused into Convolution diff --git a/src/tests/functional/plugin/shared/include/snippets/add.hpp b/src/tests/functional/plugin/shared/include/snippets/add.hpp index 84338e53215f3a..3f19a02737980c 100644 --- a/src/tests/functional/plugin/shared/include/snippets/add.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/add.hpp @@ -19,6 +19,14 @@ typedef std::tuple< std::string // Target Device > AddParams; +typedef std::tuple< + std::vector, // Input 0, Input 1 Shape + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddParamsPair; + typedef std::tuple< ov::Shape, // Input 0 Shape ov::element::Type, // Element type @@ -41,6 +49,15 @@ class AddSinh : public Add { void SetUp() override; }; +// repack AddSinh input shapes into shape vector to cover some cases easier +class AddSinhPair : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; +}; + class AddSinhConst : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: diff --git a/src/tests/functional/plugin/shared/include/snippets/convert.hpp b/src/tests/functional/plugin/shared/include/snippets/convert.hpp index bd4d7641711a0a..fe534480fc4268 100644 --- a/src/tests/functional/plugin/shared/include/snippets/convert.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/convert.hpp @@ -11,7 +11,7 @@ 
namespace test { namespace snippets { typedef std::tuple< - std::vector, // InputShapes + std::vector, // InputShapes std::pair, std::vector>, // Input and Output data types for Converts size_t, // Expected num nodes size_t, // Expected num subgraphs diff --git a/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp b/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp index bb39b7ded31678..a8b3c202b78a0c 100644 --- a/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp @@ -19,6 +19,15 @@ typedef std::tuple< std::string // Target Device > ThreeInputsEltwiseParams; +typedef std::tuple< + InputShape, // Input 0 Shape + InputShape, // Input 1 Shape + InputShape, // Input 2 Shape + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device + > ThreeInputsEltwiseDynamicParams; + class ThreeInputsEltwise : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: @@ -33,7 +42,6 @@ class ThreeInputsEltwiseSinh : public ThreeInputsEltwise { void SetUp() override; }; - } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp index 0a209de2fe9244..4284ceacfa4541 100644 --- a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp @@ -11,7 +11,7 @@ namespace test { namespace snippets { typedef std::tuple< - std::vector, // Input Shape All shapes + std::vector, // Input Shape All shapes size_t, // Expected num nodes size_t, // Expected num subgraphs std::string // Target Device diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp 
b/src/tests/functional/plugin/shared/src/snippets/add.cpp index c524a54539f304..f0a1908ef2a9e8 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -5,6 +5,8 @@ #include "common_test_utils/common_utils.hpp" #include "snippets/add.hpp" #include "subgraph_simple.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" namespace ov { namespace test { @@ -87,6 +89,38 @@ void AddRollConst::SetUp() { setInferenceType(type); } +std::string AddSinhPair::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + ov::element::Type type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, type, num_nodes, num_subgraphs, targetDevice) = obj.param; + if (input_shapes.size() != 2) + IE_THROW() << "Invalid input shapes vector size"; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(input_shapes[0]) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(input_shapes[1]) << "_"; + result << "T=" << type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void AddSinhPair::SetUp() { + std::vector input_shapes; + ov::element::Type type; + std::tie(input_shapes, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + std::vector is; + for (const auto& s : input_shapes) { + is.emplace_back(InputShape {{}, {s, }}); + } + init_input_shapes(is); + auto f = ov::test::snippets::AddSinhFunction({input_shapes[0], input_shapes[1]}); + function = f.getOriginal(); + setInferenceType(type); +} + TEST_P(Add, CompareWithRefImpl) { run(); validateNumSubgraphs(); @@ -107,6 +141,10 @@ TEST_P(AddRollConst, CompareWithRefImpl) { validateNumSubgraphs(); } +TEST_P(AddSinhPair, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} } // namespace snippets } // 
namespace test diff --git a/src/tests/functional/plugin/shared/src/snippets/convert.cpp b/src/tests/functional/plugin/shared/src/snippets/convert.cpp index b4c5c840cb6869..9f9c343d351ace 100644 --- a/src/tests/functional/plugin/shared/src/snippets/convert.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/convert.cpp @@ -12,7 +12,7 @@ namespace test { namespace snippets { std::string Convert::getTestCaseName(testing::TestParamInfo obj) { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::string targetDevice; size_t num_nodes, num_subgraphs; @@ -21,7 +21,7 @@ std::string Convert::getTestCaseName(testing::TestParamInfo inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); - + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); output_type = types.second.front(); @@ -85,11 +84,10 @@ void Convert::generate_inputs(const std::vector& targetInputStaticSha } void ConvertInput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); - + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertInputFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); } @@ -125,10 +123,10 @@ parameters ConvertInput::generate_params_random() const { } void ConvertOutput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - 
init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertOutputFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); @@ -136,10 +134,10 @@ void ConvertOutput::SetUp() { } void ConvertStub::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertStubFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); @@ -147,40 +145,40 @@ void ConvertStub::SetUp() { } void ConvertPartialInputsAndResults::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertPartialInputsAndResultsFunction(inputShape, types.first, types.second); function = f.getOriginal(); } void ConvertManyOnInputs::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnInputsFunction(inputShape, types.first); function = f.getOriginal(); } void ConvertManyOnOutputs::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, 
ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnOutputsFunction(inputShape, types.first); function = f.getOriginal(); } void ConvertManyOnInputOutput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnInputOutputFunction(inputShape, types.first, types.second); function = f.getOriginal(); diff --git a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp index 1140937be63359..221490bb00017c 100644 --- a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp @@ -27,10 +27,10 @@ std::string MaxNumParamsEltwiseSinh::getTestCaseName(testing::TestParamInfoGetParam(); - std::vector expandedShapes(10, inputShape); + std::vector expandedShapes(10, inputShape); std::vector input_shapes; for (const auto& s : expandedShapes) { - input_shapes.emplace_back(InputShape {{}, {s, }}); + input_shapes.emplace_back(InputShape {{}, {s.get_shape(), }}); } init_input_shapes(input_shapes); diff --git a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp index 276218e6150c57..b2ebed8e6f1ccc 100644 --- a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp @@ -5,6 +5,7 @@ #include 
"common_test_utils/common_utils.hpp" #include "snippets/three_inputs_eltwise.hpp" #include "subgraph_simple.hpp" +#include "functional_test_utils/skip_tests_config.hpp" namespace ov { namespace test { diff --git a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp index 205587e1a30f97..2e4ae1b0643adc 100644 --- a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp @@ -11,14 +11,14 @@ namespace test { namespace snippets { std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo obj) { - std::vector inputShapes; + std::vector inputShapes; std::string targetDevice; size_t num_nodes, num_subgraphs; std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param; std::ostringstream result; for (auto i = 0; i < inputShapes.size(); i++) - result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i].get_shape()) << "_"; result << "#N=" << num_nodes << "_"; result << "#S=" << num_subgraphs << "_"; result << "targetDevice=" << targetDevice; @@ -26,9 +26,9 @@ std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo inputShape; + std::vector inputShape; std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::TwoInputsAndOutputsFunction(inputShape); function = f.getOriginal(); } diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp index 2d1038d272b8a9..80ab2ce90f28d5 100644 --- 
a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp @@ -61,6 +61,14 @@ class SubgraphBaseTest : public CommonTestUtils::TestsCommon { virtual std::vector get_plugin_outputs(); }; +inline std::vector dynamic_shapes_to_test_representation(const std::vector& shapes) { + std::vector result; + for (const auto& staticShape : shapes) { + result.push_back({{staticShape}, {staticShape.get_shape()}}); + } + return result; +} + inline std::vector> static_shapes_to_test_representation(const std::vector>& shapes) { std::vector> result; for (const auto& staticShapes : shapes) { diff --git a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp index 237e3b57dd4484..5ab13dafe68dff 100644 --- a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp +++ b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp @@ -43,9 +43,15 @@ ov::runtime::Tensor generate(const std::shared_ptr& node, } namespace Activation { +// todo: this is a bug fixed! Merge it separately. +// Default parameters InputGenerateData(10, 20, 32768, 1) lead to input generation according to 10 + x/32768, +// where x {0, 20}, so all generated values are in the range [10, 10 + 6.1e-4]. 
+// Thus all the interval more-or-less fall within the uncertainty validation interval +// Fix let the range be at least 20x of resolution ov::runtime::Tensor generate(const ov::element::Type& elemType, const ov::Shape& targetShape, - InputGenerateData inGenData = InputGenerateData(10, 20, 32768, 1)) { +// InputGenerateData inGenData = InputGenerateData(10, 20, 32768, 1)) { + InputGenerateData inGenData = InputGenerateData(-1, 2*32768, 32768, 1)) { if (!elemType.is_signed()) { inGenData.range = 15; inGenData.start_from = 0; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp index 505829b9fd20dd..08e3dfc9c859e8 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp @@ -18,7 +18,7 @@ class SnippetsFunctionBase { public: SnippetsFunctionBase() = delete; - explicit SnippetsFunctionBase(const std::vector& inputShapes, ov::element::Type_t precision = element::f32) + explicit SnippetsFunctionBase(const std::vector& inputShapes, ov::element::Type_t precision = element::f32) : input_shapes{inputShapes}, precision{precision} {}; std::shared_ptr getReference() const { @@ -53,7 +53,7 @@ class SnippetsFunctionBase { } const ov::element::Type_t precision; - const std::vector input_shapes; + const std::vector input_shapes; virtual void validate_function(const std::shared_ptr &f) const; }; @@ -67,7 +67,7 @@ class SnippetsFunctionBase { class SnippetsFunctionCustomizable : public SnippetsFunctionBase { public: SnippetsFunctionCustomizable() = delete; - SnippetsFunctionCustomizable(const std::vector& inputShapes, + SnippetsFunctionCustomizable(const std::vector& inputShapes, const std::vector>& customOps, const std::vector&& customOpsNumInputs); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp index a7c6bd34e0f58e..526234409b348e 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp @@ -22,7 +22,7 @@ namespace snippets { // Result class ConvertFunction : public SnippetsFunctionBase { public: - explicit ConvertFunction(const std::vector& inputShapes, + explicit ConvertFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::u8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -45,7 +45,7 @@ class ConvertFunction : public SnippetsFunctionBase { // Result class ConvertInputFunction : public SnippetsFunctionBase { public: - explicit ConvertInputFunction(const std::vector& inputShapes, + explicit ConvertInputFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::u8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -67,7 +67,7 @@ class ConvertInputFunction : public SnippetsFunctionBase { // Result class ConvertOutputFunction : public SnippetsFunctionBase { public: - explicit ConvertOutputFunction(const std::vector& inputShapes, + explicit ConvertOutputFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::i8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -91,7 +91,7 @@ class ConvertOutputFunction : public SnippetsFunctionBase { // Result Result class ConvertStubFunction : public SnippetsFunctionBase { public: - explicit ConvertStubFunction(const std::vector& inputShapes, + explicit ConvertStubFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = 
ov::element::i8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -117,7 +117,7 @@ class ConvertStubFunction : public SnippetsFunctionBase { // Result2 class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase { public: - explicit ConvertPartialInputsAndResultsFunction(const std::vector& inputShapes, + explicit ConvertPartialInputsAndResultsFunction(const std::vector& inputShapes, const std::vector& inTypes = {ov::element::f32}, const std::vector& outTypes = {ov::element::f32}) : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { @@ -142,7 +142,7 @@ class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase { // Result class ConvertManyOnInputsFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) + explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) : SnippetsFunctionBase(inputShapes), types(types) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); @@ -165,7 +165,7 @@ class ConvertManyOnInputsFunction : public SnippetsFunctionBase { // Result Result class ConvertManyOnOutputsFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) + explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) : SnippetsFunctionBase(inputShapes), types(types) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); @@ -191,7 +191,7 @@ class ConvertManyOnOutputsFunction : public SnippetsFunctionBase { // Result Result class ConvertManyOnInputOutputFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnInputOutputFunction(const std::vector& 
inputShapes, + explicit ConvertManyOnInputOutputFunction(const std::vector& inputShapes, const std::vector& inTypes, const std::vector& outTypes) : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp index b663c22671f30a..3cbcfdac4a5af6 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp @@ -28,7 +28,7 @@ namespace snippets { // Result class ConvMulActivationFunction : public SnippetsFunctionCustomizable { public: - explicit ConvMulActivationFunction(const std::vector& inputShapes, const std::vector>& customOps) + explicit ConvMulActivationFunction(const std::vector& inputShapes, const std::vector>& customOps) : SnippetsFunctionCustomizable(inputShapes, customOps, {2, 1, 1}) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); NGRAPH_CHECK(input_shapes[0].size() == 4, "Only 4D input shapes are currently supported"); @@ -36,6 +36,7 @@ class ConvMulActivationFunction : public SnippetsFunctionCustomizable { ov::op::util::is_unary_elementwise_arithmetic(customOps[1]) && ov::op::util::is_unary_elementwise_arithmetic(customOps[2]), "Got invalid custom ops: expected binary and two unary operations"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static(), "This test supports only static shapes"); } private: std::shared_ptr initOriginal() const override; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp index fad086acf031e1..69027e96452751 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp @@ -22,7 +22,7 @@ namespace snippets { class AddFunctionLoweredBroadcast : public AddFunction { public: - explicit AddFunctionLoweredBroadcast(const std::vector& inputShapes, const std::vector& broadcastShapes) : + explicit AddFunctionLoweredBroadcast(const std::vector& inputShapes, const std::vector& broadcastShapes) : AddFunction(inputShapes), broadcast_shapes{broadcastShapes} { NGRAPH_CHECK(input_shapes.size() == broadcast_shapes.size(), "Broadcast shapes should have the same size as input_shapes"); @@ -37,10 +37,12 @@ class AddFunctionLoweredBroadcast : public AddFunction { class EltwiseThreeInputsLoweredFunction : public EltwiseThreeInputsFunction { public: - explicit EltwiseThreeInputsLoweredFunction(const std::vector& inputShapes, const std::vector& broadcastShapes) : + explicit EltwiseThreeInputsLoweredFunction(const std::vector& inputShapes, const std::vector& broadcastShapes) : EltwiseThreeInputsFunction(inputShapes), broadcast_shapes{broadcastShapes} { NGRAPH_CHECK(input_shapes.size() == broadcast_shapes.size(), "Broadcast shapes should have the same size as input_shapes"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static() && input_shapes[2].is_static(), + "Broadcast shapes should have the same size as input_shapes"); } protected: diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp index dd9f342d7b388d..a1254dfaa80521 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp @@ -22,7 +22,7 @@ namespace snippets { // Result class AddFunction : public SnippetsFunctionBase { public: - explicit AddFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddFunction(const std::vector& 
inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -39,7 +39,7 @@ class AddFunction : public SnippetsFunctionBase { // todo: remove Sinh once "no subgraph after input" limitation is relaxed class AddSinhFunction : public SnippetsFunctionBase { public: - explicit AddSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -54,8 +54,9 @@ class AddSinhFunction : public SnippetsFunctionBase { // todo: remove Sinh once "no subgraph after input" limitation is relaxed class AddSinhConstFunction : public SnippetsFunctionBase { public: - explicit AddSinhConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddSinhConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "This test supports only static shapes"); } protected: std::shared_ptr initOriginal() const override; @@ -71,8 +72,9 @@ class AddSinhConstFunction : public SnippetsFunctionBase { // The function is needed to check different input element types (model precision change) class AddRollConstFunction : public SnippetsFunctionBase { public: - explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "Only static shapes are supported"); } protected: std::shared_ptr initOriginal() const override; @@ -87,7 +89,7 @@ class AddRollConstFunction : public 
SnippetsFunctionBase { // Result class EltwiseFunction : public SnippetsFunctionBase { public: - explicit EltwiseFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -102,7 +104,7 @@ class EltwiseFunction : public SnippetsFunctionBase { // Result class EltwiseThreeInputsFunction : public SnippetsFunctionBase { public: - explicit EltwiseThreeInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseThreeInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); } protected: @@ -113,7 +115,7 @@ class EltwiseThreeInputsFunction : public SnippetsFunctionBase { // todo: remove Sinh once "no subgraph after input" limitation is relaxed class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase { public: - explicit EltwiseThreeInputsSinhFunction(const std::vector& inputShapes) : + explicit EltwiseThreeInputsSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); } @@ -131,7 +133,7 @@ class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase { // todo: remove Sinh once "no subgraph after input" limitation is relaxed class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase { public: - explicit EltwiseMaxNumParamsSinhFunction(const std::vector& inputShapes) : + explicit EltwiseMaxNumParamsSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes"); } @@ -147,7 +149,7 @@ class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase { // Result class MatMulEltwiseBranchesFunction : 
public SnippetsFunctionBase { public: - explicit MatMulEltwiseBranchesFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit MatMulEltwiseBranchesFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); NGRAPH_CHECK(input_shapes[0].size() == 4 && input_shapes[1].size() == 4, "Only 4D input shapes are currently supported by this test"); @@ -155,6 +157,7 @@ class MatMulEltwiseBranchesFunction : public SnippetsFunctionBase { // Note that single-element constant are not supported by the test, since they'll be converted // to snippets::op::Scalar. So a more comlex logics is required to produce reference function. NGRAPH_CHECK(input_shapes[0][1] == input_shapes[1][1], "Channel dimensions must be equal and != 1"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static(), "This test supports only static shapes"); } protected: @@ -170,7 +173,7 @@ class MatMulEltwiseBranchesFunction : public SnippetsFunctionBase { // Result class EltwiseLogLoopFunction : public SnippetsFunctionBase { public: - explicit EltwiseLogLoopFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseLogLoopFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -188,7 +191,7 @@ class EltwiseLogLoopFunction : public SnippetsFunctionBase { // Result class EltwiseTwoResultsFunction : public SnippetsFunctionBase { public: - explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -206,7 +209,7 @@ class EltwiseTwoResultsFunction : public SnippetsFunctionBase { 
// Result class TwoInputsAndOutputsFunction : public SnippetsFunctionBase { public: - explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp index ff7cdc986a59b1..8cec4a4aca95f6 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp @@ -15,11 +15,11 @@ void SnippetsFunctionBase::validate_function(const std::shared_ptr &f) co NGRAPH_CHECK(params.size() == input_shapes.size(), "Passed input shapes and produced function are inconsistent."); for (size_t i = 0; i < input_shapes.size(); i++) - NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_shape().begin()), + NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_partial_shape().begin()), "Passed input shapes and produced function are inconsistent."); } -SnippetsFunctionCustomizable::SnippetsFunctionCustomizable(const std::vector& inputShapes, +SnippetsFunctionCustomizable::SnippetsFunctionCustomizable(const std::vector& inputShapes, const std::vector>& customOps, const std::vector&& customOpsNumInputs) : SnippetsFunctionBase(inputShapes), custom_ops{customOps}, custom_ops_num_inputs{customOpsNumInputs} { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp index ccf1ce4081e204..9975f5185c1b61 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp @@ -13,7 +13,7 @@ namespace snippets { std::shared_ptr ConvMulActivationFunction::initOriginal() const { auto conv_param = std::make_shared(precision, input_shapes[0]); - const auto channels = input_shapes[0][1]; + const auto channels = static_cast(input_shapes[0][1].get_length()); ngraph::Shape strides(2, 1); std::vector pad_begin(2, 1), pad_end(2, 1); const Shape const_shape {channels, channels, 3, 3}; @@ -37,7 +37,7 @@ std::shared_ptr ConvMulActivationFunction::initReference() const { auto conv_param = std::make_shared(precision, input_shapes[0]); ngraph::Shape strides(2, 1); std::vector pad_begin(2, 1), pad_end(2, 1); - const auto channels = input_shapes[0][1]; + const auto channels = static_cast(input_shapes[0][1].get_length()); const Shape const_shape {channels, channels, 3, 3}; const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(const_shape), -10., 10.); auto weights = std::make_shared(precision, const_shape, const_values); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index 8fd664b192187d..d04db522a54881 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -6,6 +6,7 @@ #include "common_test_utils/data_utils.hpp" #include #include "ngraph_functions/builders.hpp" +#include namespace ov { namespace test { @@ -14,7 +15,7 @@ namespace snippets { std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { auto data0 = std::make_shared(precision, input_shapes[0]); std::shared_ptr add_input0 = nullptr; - if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].back()) { + if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].rbegin()->get_length()) { 
add_input0 = std::make_shared(data0, broadcast_shapes[0]); } else { add_input0 = std::make_shared(data0); @@ -22,18 +23,38 @@ std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { auto data1 = std::make_shared(precision, input_shapes[1]); std::shared_ptr add_input1 = nullptr; - if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].back()) { + if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].rbegin()->get_length()) { add_input1 = std::make_shared(data1, broadcast_shapes[1]); } else { add_input1 = std::make_shared(data1); } auto add = std::make_shared(add_input0, add_input1); auto store = std::make_shared(add); - return std::make_shared(NodeVector{store}, ParameterVector{data0, data1}); + ParameterVector input_params {data0, data1}; + auto model = std::make_shared(NodeVector{store}, input_params); + + // Create dummy scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + std::vector apply_increments(input_params.size() + results.size(), true); + insertLoopEnd(results, inner_loop_begin, 1, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + insertLoopEnd(results, outer_loop_begin, 0, 1, 1, apply_increments); + } + return model; } std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() const { // todo: implement conversion between std::vector and std::vector - auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0], input_shapes[1], input_shapes[2]}); + auto input_params = 
ngraph::builder::makeParams(precision, + {input_shapes[0].get_shape(), + input_shapes[1].get_shape(), + input_shapes[2].get_shape()}); auto load_or_broadcastload = [&](size_t i) -> std::shared_ptr { // user specified that no broadcasting is required if (broadcast_shapes[i].empty()) { @@ -41,7 +62,7 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons // broadcasting is required: could be Load + BroadcastMove or BroiadcastLoad } else { // The last dim is processed by vector Tile, so BroadcastLoad is required if the last dim being broadcasted - if (input_shapes[i].back() == 1 && broadcast_shapes[i].back() != 1) { + if (input_shapes[i].rbegin()->get_length() == 1 && broadcast_shapes[i].back() != 1) { return std::make_shared(input_params[i], broadcast_shapes[i]); // Todo: Cover this logics with functional tests, Review FakeBroadcast Emitter // Broadcasting of other dims is handled by BroadcastMove. Strictly speaking, broadcasting is achieved via @@ -57,12 +78,6 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto sub_scalar = std::make_shared(precision, Shape{1}, const_values[0]); std::shared_ptr sub_load; -// Todo: Uncomment when invalid read in vector tile will be fixed -// if (input_shapes[2].back() == 1) -// sub_load = std::make_shared(input_params[2]); -// else -// sub_load = std::make_shared(input_params[2]); -// remove when the code above is enabled: sub_load = std::make_shared(input_params[2]); auto sub = std::make_shared(sub_load, sub_scalar); std::shared_ptr sub_out; @@ -72,7 +87,23 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons sub_out = std::make_shared(sub, broadcast_shapes[2]); auto mul = std::make_shared(add, sub_out); auto store = std::make_shared(mul); - return std::make_shared(NodeVector{store}, input_params); + auto model = std::make_shared(NodeVector{store}, input_params); + + // Create dummy 
scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + std::vector apply_increments(input_params.size() + results.size(), true); + const auto& inner_loop_end = insertLoopEnd(results, inner_loop_begin, 1, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + insertLoopEnd(results, outer_loop_begin, 0, 1, 1, apply_increments); + } + return model; } } // namespace snippets } // namespace test diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp index 237e9b717273d4..6fa4648a5548a9 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp @@ -47,17 +47,19 @@ std::shared_ptr AddSinhFunction::initReference() const { return std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); } std::shared_ptr AddSinhConstFunction::initOriginal() const { + Shape static_input_shape = input_shapes[0].get_shape(); auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(static_input_shape), -10., 10.); + auto const_data1 = std::make_shared(precision, static_input_shape, const_values); auto sin0 = 
std::make_shared(data0); auto add = std::make_shared(sin0, const_data1); return std::make_shared(NodeVector{add}, ParameterVector{data0}); } std::shared_ptr AddRollConstFunction::initOriginal() const { - auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const auto input_shape = input_shapes[0].get_shape(); + auto data0 = std::make_shared(precision, input_shape); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shape), -10., 10.); + auto const_data1 = std::make_shared(precision, input_shape, const_values); auto shift = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); auto axes = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); auto roll0 = std::make_shared(data0, shift, axes); @@ -70,7 +72,7 @@ std::shared_ptr AddRollConstFunction::initOriginal() const { std::shared_ptr EltwiseFunction::initOriginal() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[1]), -10., 10.); + const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto const_data = std::make_shared(precision, data1->get_shape(), const_values); auto add = std::make_shared(data0, data1); auto sub = std::make_shared(add, const_data); @@ -80,7 +82,7 @@ std::shared_ptr EltwiseFunction::initOriginal() const { std::shared_ptr EltwiseFunction::initReference() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[1]), -10., 10.); + const std::vector const_values = 
CommonTestUtils::generate_float_numbers(1, -10., 10.); auto const_data = std::make_shared(precision, data1->get_shape(), const_values); auto indata0 = std::make_shared(precision, data0->get_shape()); auto indata1 = std::make_shared(precision, data1->get_shape()); @@ -177,8 +179,8 @@ std::shared_ptr MatMulEltwiseBranchesFunction::initReference() const auto sub_const_2 = std::make_shared(precision, Shape{1}, const_values[3]); // snippet function - Shape matMulOutShape = input_shapes[0]; - matMulOutShape.back() = input_shapes[1].back(); + Shape matMulOutShape = input_shapes[0].get_shape(); + matMulOutShape.back() = input_shapes[1].get_shape().back(); auto snippet_input = std::make_shared(precision, matMulOutShape); auto mul_1 = std::make_shared(snippet_input, mul_const_1);