diff --git a/src/common/snippets/include/snippets/generator.hpp b/src/common/snippets/include/snippets/generator.hpp index 92b321e88e8b68..f21a6951fedd62 100644 --- a/src/common/snippets/include/snippets/generator.hpp +++ b/src/common/snippets/include/snippets/generator.hpp @@ -84,7 +84,7 @@ class Schedule { * @param f can this kernel be linearided to 1D range * @param p pointer to generated code */ - Schedule(const Shape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {} + Schedule(const ov::PartialShape& ws, bool f, code p) : work_size(ws), is_flat(f), ptr(p) {} /** * @brief Returns callable instanse of code pointer */ @@ -92,7 +92,7 @@ class Schedule { return reinterpret_cast(const_cast(ptr)); } - Shape work_size {}; + ov::PartialShape work_size {}; bool is_flat {false}; code ptr {nullptr}; }; @@ -123,7 +123,7 @@ class Generator { * @brief gets target machine * @return pointer to constant target machine */ - std::shared_ptr get_target_machine() const { return target; } + std::shared_ptr get_target_machine() const; protected: std::shared_ptr target; diff --git a/src/common/snippets/include/snippets/op/broadcastload.hpp b/src/common/snippets/include/snippets/op/broadcastload.hpp index 0d90fb15a84b97..851c0ca8c3ea7d 100644 --- a/src/common/snippets/include/snippets/op/broadcastload.hpp +++ b/src/common/snippets/include/snippets/op/broadcastload.hpp @@ -21,7 +21,7 @@ class BroadcastLoad : public BroadcastMove { public: OPENVINO_OP("BroadcastLoad", "SnippetsOpset", ngraph::snippets::op::BroadcastMove); - BroadcastLoad(const Output& x, Shape output_shape); + BroadcastLoad(const Output& x, ov::PartialShape output_shape); BroadcastLoad() = default; bool visit_attributes(AttributeVisitor& visitor) override; @@ -29,17 +29,6 @@ class BroadcastLoad : public BroadcastMove { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; - - void set_broadcast_info(const Shape& bct) { - broadcast_info = 
bct; - } - - bool is_broadcast(size_t idx) { - return broadcast_info[idx] == 1; - } - -private: - Shape broadcast_info; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/broadcastmove.hpp b/src/common/snippets/include/snippets/op/broadcastmove.hpp index 0ab279f1ac814e..8514d61898b8df 100644 --- a/src/common/snippets/include/snippets/op/broadcastmove.hpp +++ b/src/common/snippets/include/snippets/op/broadcastmove.hpp @@ -19,7 +19,7 @@ class BroadcastMove : public ngraph::op::Op { public: OPENVINO_OP("BroadcastMove", "SnippetsOpset"); - BroadcastMove(const Output& x, Shape output_shape); + BroadcastMove(const Output& x, ov::PartialShape output_shape); BroadcastMove() = default; bool visit_attributes(AttributeVisitor& visitor) override; @@ -28,12 +28,9 @@ class BroadcastMove : public ngraph::op::Op { void validate_and_infer_types() override; - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END protected: - Shape output_shape; + ov::PartialShape output_shape; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/load.hpp b/src/common/snippets/include/snippets/op/load.hpp index 7f53240ae21946..a263db7581987f 100644 --- a/src/common/snippets/include/snippets/op/load.hpp +++ b/src/common/snippets/include/snippets/op/load.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "snippets/op/memory_access.hpp" namespace ngraph { namespace snippets { @@ -17,29 +18,14 @@ namespace op { * Default value is "1" - to load one element * @ingroup snippets */ -class Load : public ngraph::op::Op { +class Load : public MemoryAccess { public: OPENVINO_OP("Load", "SnippetsOpset"); Load(const Output& x, const size_t count = 1lu); Load() = default; - size_t get_count() const { return m_count; } - - void set_count(const size_t count) { m_count = count; } - - bool visit_attributes(AttributeVisitor& visitor) override; - 
std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - void validate_and_infer_types() override; - - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END - -protected: - size_t m_count = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/loop.hpp b/src/common/snippets/include/snippets/op/loop.hpp new file mode 100644 index 00000000000000..519cc53ddd3eaf --- /dev/null +++ b/src/common/snippets/include/snippets/op/loop.hpp @@ -0,0 +1,92 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "snippets/emitter.hpp" +#include "ngraph/op/parameter.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface LoopBase + * @brief Inserted during scheduling generation and represents Loop in affine notation + * @ingroup snippets + */ +class LoopBase : public ngraph::op::Op { +public: + OPENVINO_OP("LoopBase", "SnippetsOpset"); + LoopBase(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment); + LoopBase() = delete; + bool visit_attributes(AttributeVisitor& visitor) override; + size_t get_work_amount() const; + size_t get_increment() const; + size_t get_dimension() const; + bool get_evaluate_once() const; + +protected: + size_t dimension; + size_t work_amount; + size_t increment; + bool evaluate_once; // true if the Loop is executed only once, used to skip setting and testing the loop counter +}; +class LoopEnd; +class LoopBegin : public LoopBase { + friend LoopEnd; +public: + OPENVINO_OP("LoopBegin", "SnippetsOpset"); + /// \brief Construct an Loop + /// \param region The vector of pairs: emitters and the corresponding registers + /// \param increment Loop size - count of elements to load and store. 
+ /// Vector Loop should have size of vector register and Scalar Loop should have 1 + /// \param num_inputs Count of inputs + /// \param num_outputs Count of outputs + /// \param io_dims Vector of last dimensions of inputs and outputs + /// \param io_data_sizes Vector of data type sizes of inputs and outputs + explicit LoopBegin(const std::vector>& args); + LoopBegin() = delete; + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + std::shared_ptr get_loop_end(); + // begin_address and input_regs are needed to communicate information between LoopBegin and LoopEnd emitters + const uint8_t* begin_address; + std::vector input_regs; +private: + void validate_and_infer_types_except_LoopEnd(); + LoopBegin(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment); +}; + +class LoopEnd : public LoopBase { +public: + OPENVINO_OP("LoopEnd", "SnippetsOpset"); + LoopEnd(const std::vector>& args, size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment, std::vector finalization_offsets); + LoopEnd() = delete; + std::shared_ptr get_loop_begin(); + void validate_and_infer_types() override; + std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override; + const std::vector& get_finalization_offsets() const; + const std::vector& get_apply_increment() const; + void set_finalization_offsets(std::vector offsets); + void set_apply_increment(std::vector apply_increment); + void set_work_amount(size_t new_work_amount); + void set_increment(size_t new_increment); + void set_evaluate_once(bool once); + // Used to propagate information about Loop structure, needed to simplify some optimizations. 
For example, + // to skip pointer increments when outer Loop is empty, and work_amount == vector_size (one inner vector Loop) + // true by default, the optimizations enabled if it's false; + bool has_outer_loop; + +private: + std::vector apply_increment; + std::vector finalization_offsets; + size_t loop_io_size; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/loop_helpers.hpp b/src/common/snippets/include/snippets/op/loop_helpers.hpp new file mode 100644 index 00000000000000..57a14e5f036cc9 --- /dev/null +++ b/src/common/snippets/include/snippets/op/loop_helpers.hpp @@ -0,0 +1,67 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ngraph/op/op.hpp" +#include "ngraph/op/parameter.hpp" +#include "loop.hpp" + +namespace ngraph { +namespace snippets { +namespace op { + +/* ==== LoopBegin === */ +std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs); + +template +std::shared_ptr insertLoopBegin(const T& afterTheseNodes) { + static_assert(std::is_same() || std::is_same(), + "Unsupported template parameter for insertLoopBegin. 
Only ParameterVector or NodeVector is allowed"); + OutputVector originalOutputs; + std::vector>> childInputs; + for (const auto &n : afterTheseNodes) { + const auto& nodeOutputs = n->outputs(); + // Ignore the LoopBegin->LoopEnd edge to make it easier to construct enclosed Loops + std::move(nodeOutputs.begin(), nodeOutputs.end() - 1 * ov::is_type(n), std::back_inserter(originalOutputs)); + } + + return insertLoopBeginAfterOutputs(originalOutputs); +} + +template<> +inline std::shared_ptr insertLoopBegin(const OutputVector& afterTheseNodes) { + return insertLoopBeginAfterOutputs(afterTheseNodes); +} +/* ============== */ + +/* ==== LoopEnd === */ +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& tileBegin, + size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment = {}, + std::vector finalization_offsets = {}); + +template +std::shared_ptr insertLoopEnd(const T& beforeTheseNodes, Args ...args) { + static_assert(std::is_same() || std::is_same(), + "Unsupported template parameter for insertLoopBegin. 
Only ParameterVector or NodeVector is allowed"); + std::vector> originalInputs; + for (const auto &n : beforeTheseNodes) { + const auto& nodeInputs = n->inputs(); + // Ignore the LoopBegin->LoopEnd edge to facilitate enclosed Loops construction + std::move(nodeInputs.begin(), nodeInputs.end() - 1 * ov::is_type(n), std::back_inserter(originalInputs)); + } + return insertLoopEndBeforeInputs(originalInputs, args...); +} + +template +std::shared_ptr insertLoopEnd(const std::vector>& beforeTheseNodes, Args ...args) { + return insertLoopEndBeforeInputs(beforeTheseNodes, args...); +} +/* ============== */ + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/memory_access.hpp b/src/common/snippets/include/snippets/op/memory_access.hpp new file mode 100644 index 00000000000000..22aca3f358be4c --- /dev/null +++ b/src/common/snippets/include/snippets/op/memory_access.hpp @@ -0,0 +1,38 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +/** + * @interface MemoryAccess + * @brief This is an ubre + * where number of elements to store is determined by "count" + * Default value is "1" - to store one element + * @ingroup snippets + */ + +class MemoryAccess : public ngraph::op::Op { +public: + OPENVINO_OP("MemoryAccess", "SnippetsOpset"); + + size_t get_count() const; + void set_count(size_t count); + bool visit_attributes(AttributeVisitor& visitor) override; + void validate_and_infer_types() override; + +protected: + explicit MemoryAccess(const Output& x, size_t count = 1lu); + MemoryAccess() = default; + size_t m_count = 0lu; +}; + +} // namespace op +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/op/scalar.hpp b/src/common/snippets/include/snippets/op/scalar.hpp index a8de072be50f10..1916f554a32c67 100644 
--- a/src/common/snippets/include/snippets/op/scalar.hpp +++ b/src/common/snippets/include/snippets/op/scalar.hpp @@ -34,6 +34,7 @@ class Scalar : public ov::op::v0::Constant { std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; void validate_and_infer_types() override; + bool visit_attributes(AttributeVisitor& visitor) override; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/store.hpp b/src/common/snippets/include/snippets/op/store.hpp index 0ff5cc3ec8e063..b83a4fdcec2b18 100644 --- a/src/common/snippets/include/snippets/op/store.hpp +++ b/src/common/snippets/include/snippets/op/store.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include "snippets/op/memory_access.hpp" namespace ngraph { namespace snippets { @@ -17,29 +18,14 @@ namespace op { * Default value is "1" - to store one element * @ingroup snippets */ -class Store : public ngraph::op::Op { +class Store : public MemoryAccess { public: OPENVINO_OP("Store", "SnippetsOpset"); Store(const Output& x, const size_t count = 1lu); Store() = default; - size_t get_count() const { return m_count; } - - void set_count(const size_t count) { m_count = count; } - - bool visit_attributes(AttributeVisitor& visitor) override; - std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; - - void validate_and_infer_types() override; - - OPENVINO_SUPPRESS_DEPRECATED_START - bool evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const override; - OPENVINO_SUPPRESS_DEPRECATED_END - -protected: - size_t m_count = 0lu; }; } // namespace op diff --git a/src/common/snippets/include/snippets/op/subgraph.hpp b/src/common/snippets/include/snippets/op/subgraph.hpp index 43c6376ad607c1..dfcde2bd4fd2c6 100644 --- a/src/common/snippets/include/snippets/op/subgraph.hpp +++ b/src/common/snippets/include/snippets/op/subgraph.hpp @@ -25,6 +25,7 @@ namespace op { class Subgraph : public ngraph::op::Op { public: 
OPENVINO_OP("Subgraph", "SnippetsOpset"); + enum {DYNAMIC_DIMENSION = 0xffffffffffffffff}; // < 1, 42, 17, 15, 16> < 0, 1, 2, 3, 1> // should be: @@ -67,7 +68,7 @@ class Subgraph : public ngraph::op::Op { // // D = < 1, 3, 17, 15, 32> < 0, 1, 2, 3, 4> // E = < 1, 3, 17, 1, 32> < 0, 1, 2, 3, 4> - using BlockedShape = std::tuple; + using BlockedShape = std::tuple; using BlockedShapeVector = std::vector; Subgraph(const OutputVector& args, std::shared_ptr body); @@ -105,7 +106,10 @@ class Subgraph : public ngraph::op::Op { snippets::Schedule generate(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes, const void* compile_params = nullptr); snippets::Schedule generate(ngraph::pass::Manager &opt, const void* compile_params = nullptr); snippets::Schedule generate(const void* compile_params = nullptr); - Shape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + ov::PartialShape canonicalize(const BlockedShapeVector& output_shapes, const BlockedShapeVector& input_shapes); + ov::PartialShape get_master_shape(); + std::vector reshape_body(const std::vector& input_shapes); + std::vector reshape_body(const std::vector& input_shapes); // plugin sets generator for a snippet to some specific generator. 
// it's going to be replaced with Jitters table later @@ -116,6 +120,7 @@ class Subgraph : public ngraph::op::Op { void print_statistics(bool verbose); void serialize() const; + void set_master_shape(ov::PartialShape new_shape) {master_shape = std::move(new_shape);} static auto wrap_node_as_subgraph(const std::shared_ptr& node) -> std::shared_ptr; static void fill_empty_output_names(const Output& target_output_node, const Output& replacement_output_node); @@ -146,6 +151,8 @@ class Subgraph : public ngraph::op::Op { // because TypeRelaxed::copy_with_new_inputs() isn't save-thread method bool m_has_type_relaxed_ops = false; } config; + + ov::PartialShape master_shape; }; static inline std::ostream& operator<<(std::ostream& os, const op::Subgraph::BlockedShape& blocked_shape) { diff --git a/src/common/snippets/include/snippets/op/tile.hpp b/src/common/snippets/include/snippets/op/tile.hpp deleted file mode 100644 index ac1d6ef4d1a2b9..00000000000000 --- a/src/common/snippets/include/snippets/op/tile.hpp +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ngraph/op/op.hpp" -#include "snippets/emitter.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface Tile - * @brief Generated by Canonicalization and represents Loop in affine notation - * @ingroup snippets - */ -class Tile : public ngraph::op::Op { -public: - OPENVINO_OP("Tile", "SnippetsOpset"); - - /// \brief Construct an Tile - /// \param region The vector of pairs: emitters and the corresponding registers - /// \param increment Tile size - count of elements to load and store. 
- /// Vector Tile should have size of vector register and Scalar Tile should have 1 - /// \param num_inputs Count of inputs - /// \param num_outputs Count of outputs - /// \param io_dims Vector of last dimensions of inputs and outputs - /// \param io_data_sizes Vector of data type sizes of inputs and outputs - Tile(const std::vector& region, size_t increment, size_t num_inputs, size_t num_outputs, - const std::vector& io_dims, const std::vector& io_data_sizes); - Tile() = default; - std::vector region; - size_t increment = 0; - size_t num_inputs = 0; - size_t num_outputs = 0; - std::vector io_dims {}; - std::vector io_data_size {}; - - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(region, increment, num_inputs, num_outputs, io_dims, io_data_size); - } -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/include/snippets/op/tile_scheduler.hpp b/src/common/snippets/include/snippets/op/tile_scheduler.hpp deleted file mode 100644 index 9d6010f77978b0..00000000000000 --- a/src/common/snippets/include/snippets/op/tile_scheduler.hpp +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (C) 2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "ngraph/op/op.hpp" -#include "snippets/emitter.hpp" -#include "tile.hpp" - -namespace ngraph { -namespace snippets { -namespace op { - -/** - * @interface TileScheduler - * @brief Contains a set of Tiles (currently one vector and one scalar) and performs necessary preparations - * before the Tiles could be executed: calculates offsets, sets proper work amounts, decrement pointers if the same data - * have to be read several times (broadcasting). 
- * @ingroup snippets - */ -class TileScheduler : public ngraph::op::Op { -public: - OPENVINO_OP("TileScheduler", "SnippetsOpset"); - - TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region); - TileScheduler() = default; - AllocatedEmitter vector_region; - AllocatedEmitter scalar_region; - // todo: this clone_with_new_inputs is irrelevant - std::shared_ptr clone_with_new_inputs(const OutputVector& inputs) const override { - return std::make_shared(vector_region, scalar_region); - } - const void *compile_params; -}; - -} // namespace op -} // namespace snippets -} // namespace ngraph diff --git a/src/common/snippets/include/snippets/pass/insert_loops.hpp b/src/common/snippets/include/snippets/pass/insert_loops.hpp new file mode 100644 index 00000000000000..874fc688e404a5 --- /dev/null +++ b/src/common/snippets/include/snippets/pass/insert_loops.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +namespace ngraph { +namespace snippets { +namespace pass { + +/** + * @interface InsertLoops + * @brief Insert explicit Loop operations into the body to process multiple data entities during one kernel execution + * @ingroup snippets + */ +class InsertLoops: public ngraph::pass::FunctionPass { +public: + OPENVINO_RTTI("InsertLoops", "0"); + InsertLoops(ov::PartialShape master_shape, size_t vector_size); + bool run_on_model(const std::shared_ptr& m) override; + +private: + ov::PartialShape master_shape; + size_t vector_size; +}; + +} // namespace pass +} // namespace snippets +} // namespace ngraph diff --git a/src/common/snippets/include/snippets/snippets_isa.hpp b/src/common/snippets/include/snippets/snippets_isa.hpp index f1c0e9056d66eb..1137de1db0c76c 100644 --- a/src/common/snippets/include/snippets/snippets_isa.hpp +++ b/src/common/snippets/include/snippets/snippets_isa.hpp @@ -17,8 +17,7 @@ #include "op/scalar.hpp" #include 
"op/powerstatic.hpp" #include "op/store.hpp" -#include "op/tile.hpp" -#include "op/tile_scheduler.hpp" +#include "op/loop.hpp" namespace ngraph { namespace snippets { diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index 0e85fe72861a21..d34b93392a09c8 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -6,34 +6,35 @@ #include "snippets/pass/assign_registers.hpp" #include "snippets/pass/vector_to_scalar.hpp" #include "snippets/pass/insert_load_store.hpp" -#include "snippets/op/tile.hpp" +#include "snippets/op/loop.hpp" +#include "snippets/op/subgraph.hpp" #include "snippets/op/kernel.hpp" #include #include +#include -auto ngraph::snippets::getRegisters(std::shared_ptr& n) -> ngraph::snippets::RegInfo { +namespace ngraph { +namespace snippets { + +auto getRegisters(const std::shared_ptr &n) -> RegInfo { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::getRegisters") - auto rt = n->get_rt_info(); // ToDo: change to reg_t std::vector rin, rout; - auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) { - for (auto reg : it_rt->second.as>()) { - rout.push_back(reg); - } + for (const auto& output : n->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) + rout.push_back(it_rt->second.as()); } for (const auto& input : n->inputs()) { - auto rt = input.get_source_output().get_node_shared_ptr()->get_rt_info(); + auto rt = input.get_source_output().get_tensor_ptr()->get_rt_info(); auto it_rt = rt.find("reginfo"); - if (it_rt != rt.end()) { - for (auto& reg : it_rt->second.as>()) { - rin.push_back(reg); - } - } + if (it_rt != rt.end()) + rin.push_back(it_rt->second.as()); } return std::make_pair(rin, rout); } @@ -42,70 +43,143 @@ ngraph::snippets::code ngraph::snippets::Generator::generate(std::shared_ptris_supported()) - throw ngraph_error("unsupported architecture for code 
genration"); + throw ngraph_error("unsupported architecture for code generation"); auto params = m->get_parameters(); auto results = m->get_results(); auto in = params.size(); auto out = results.size(); + std::vector io_last_dims(in + out); std::vector io_data_sizes(in + out); std::transform(params.begin(), params.end(), io_last_dims.begin(), - [](const std::shared_ptr& n){return n->get_output_shape(0).back();}); + [](const std::shared_ptr& n){ + auto last_dim = n->get_output_partial_shape(0).rbegin(); + return last_dim->is_dynamic() ? op::Subgraph::DYNAMIC_DIMENSION + : last_dim->get_length(); + }); std::transform(results.begin(), results.end(), io_last_dims.begin() + in, - [](const std::shared_ptr& n){return n->get_input_shape(0).back();}); + [](const std::shared_ptr &n) { + auto last_dim = n->get_input_partial_shape(0).rbegin(); + return last_dim->is_dynamic() ? op::Subgraph::DYNAMIC_DIMENSION + : last_dim->get_length(); + }); std::transform(params.begin(), params.end(), io_data_sizes.begin(), [](const std::shared_ptr& n){return n->get_element_type().size();}); std::transform(results.begin(), results.end(), io_data_sizes.begin() + in, [](const std::shared_ptr& n){return n->get_element_type().size();}); OV_ITT_TASK_CHAIN(GENERATE, ngraph::pass::itt::domains::SnippetsTransform, "Snippets::Generator", "::VectorTile") - // vector tile + // vector loop std::vector lowered; - for (auto n : m->get_ordered_ops()) { - lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n))); - } - OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile") - - // scalar tile - auto m_scalar = ov::clone_model(*m.get()); - ngraph::pass::Manager mng; - mng.register_pass(); - mng.register_pass(); - mng.run_passes(m_scalar); - OV_ITT_TASK_NEXT(GENERATE, "::ScalarTile_get") - std::vector scalar_lowered; - for (auto n : m_scalar->get_ordered_ops()) { - scalar_lowered.emplace_back(std::make_pair(target->get(n->get_type_info())(n), 
ngraph::snippets::getRegisters(n))); + auto lower_ops = [&lowered, this](const NodeVector& ops){ + std::transform(ops.begin(), ops.end(), std::back_inserter(lowered), + [this](const std::shared_ptr& n){ + return std::make_pair(target->get(n->get_type_info())(n), ngraph::snippets::getRegisters(n)); + }); + }; + // *1* solo vector/tail loop + empty outer loop + // => skip increments (both counter & ptr) : set evaluate_once flag + // *2* solo vector/tail loop + non-empty outer loop + // => skip counter increments but perform ptr increments : set evaluate_once, + // and perform pointer increments through finalization offsets + // *3* vector loop(s) + one tail loop + // => vector as usual, tail depends on outer loop, see *1* and *2* + auto optimize_single_evaluation = [](const std::shared_ptr& loop, bool force_ptr_increment = false) { + if (loop->get_work_amount() < 2 * loop->get_increment()) { + loop->set_evaluate_once(true); + if (force_ptr_increment || loop->has_outer_loop) { + const auto increment = loop->get_increment(); + std::vector new_finalization_offsets(loop->get_finalization_offsets()); + const auto& apply_increments = loop->get_apply_increment(); + for (auto i = 0; i < new_finalization_offsets.size(); i++) { + new_finalization_offsets[i] += increment * apply_increments[i]; + } + loop->set_finalization_offsets(new_finalization_offsets); + } + return true; + } else { + return false; + } + }; + const auto& ops = m->get_ordered_ops(); + for (auto op = ops.begin(); op < ops.end(); op++) { + const auto& loop_begin = ov::as_type_ptr(*op); + // ignore outer loops and possible manual tail loops + if (loop_begin && loop_begin->get_increment() != 1) { + NodeVector vector_loop, tail_loop; + std::shared_ptr vector_loop_end, tail_loop_end; + vector_loop_end = loop_begin->get_loop_end(); + tail_loop_end = nullptr; + while (*op != vector_loop_end) + vector_loop.push_back(*op++); + vector_loop.push_back(*op); + const auto work_amount = vector_loop_end->get_work_amount(); + 
const auto increment = vector_loop_end->get_increment(); + const auto tail_size = work_amount % increment; + const auto need_tail = tail_size != 0; + const auto need_vector_loop = work_amount >= increment; + // Note, that finalization_offsets could be modified inside optimize_single_evaluation, + // so need to save them here to cover (evaluate_once vector with non-zero finalization_offsets + tail) + std::vector tail_finalization_offsets = need_tail ? vector_loop_end->get_finalization_offsets() : std::vector {}; + // vector loops are required => Just copy the body, original loop is already a vector one + if (need_vector_loop) { + // Note that finalization offsets should be applied after the last iteration. + // So if there is a tail, then we should apply offsets after it, but not now. + if (need_tail) + vector_loop_end->set_finalization_offsets(std::vector(tail_finalization_offsets.size(), 0)); + // force ptr increments if there is tail + optimize_single_evaluation(vector_loop_end, need_tail); + lower_ops(vector_loop); + } + OV_ITT_TASK_NEXT(GENERATE, "::TailLoop") + // tail is required => transform the body into a tail representation + // tail loop is fake loop because for tail we should calculate only + // finalization offsets which are supported by LoopEnd. 
+ if (need_tail) { + NodeMap vector_to_tail_node_map; + tail_loop = ngraph::clone_nodes(vector_loop, vector_to_tail_node_map); + std::transform(tail_loop.begin(), tail_loop.end(), tail_loop.begin(), + [tail_size](const std::shared_ptr& n){ + const auto& memory_access = std::dynamic_pointer_cast(n); + if (memory_access && memory_access->get_count() != 1) { + memory_access->set_count(tail_size); + } + return n; + }); + tail_loop_end = ov::as_type_ptr(*tail_loop.rbegin()); + tail_loop_end->set_finalization_offsets(tail_finalization_offsets); + tail_loop_end->set_increment(tail_size); + tail_loop_end->set_work_amount(tail_size); + tail_loop_end->has_outer_loop = vector_loop_end->has_outer_loop; + // tail loop is always executed once + optimize_single_evaluation(tail_loop_end); + lower_ops(tail_loop); + } + } else { + lower_ops({*op}); + } } - OV_ITT_TASK_NEXT(GENERATE, "::Tiles1D"); - // wrapping into tiles1D - //todo: in, out, and io_last_dims should derive naturally from the graph representation - const auto& vector_tile = std::make_shared(lowered, target->get_lanes(), in, out, io_last_dims, io_data_sizes); - const auto& vector_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(vector_tile), - std::make_pair(std::vector{}, std::vector{})); - const auto& scalar_tile = std::make_shared(scalar_lowered, 1, in, out, io_last_dims, io_data_sizes); - const auto& scalar_region = std::make_pair(target->get(ngraph::snippets::op::Tile::get_type_info_static())(scalar_tile), - std::make_pair(std::vector{}, std::vector{})); - - OV_ITT_TASK_NEXT(GENERATE, "::Tiles2D") - // wrapping into tiles2D - auto tile_scheduler = std::make_shared(vector_region, scalar_region); - tile_scheduler->compile_params = compile_params; - const auto& tile_scheduler_region = std::make_pair(target->get(ngraph::snippets::op::TileScheduler::get_type_info_static())(tile_scheduler), - std::make_pair(std::vector({in, out, target->get_lanes()}), std::vector{})); 
OV_ITT_TASK_NEXT(GENERATE, "::EmitCode") // emission - auto tiles2DKernel = std::make_shared(std::vector {tile_scheduler_region}); - tiles2DKernel->compile_params = compile_params; - std::shared_ptr kernel = target->get(ngraph::snippets::op::Kernel::get_type_info_static())(tiles2DKernel); + auto loops2DKernel = std::make_shared(std::vector{lowered}); + loops2DKernel->compile_params = compile_params; + std::shared_ptr kernel = target->get(op::Kernel::get_type_info_static())(loops2DKernel); + kernel->emit_code({in, out}, {}); + OV_ITT_TASK_NEXT(GENERATE, "::EmitData") - lowered.insert(lowered.end(), scalar_lowered.begin(), scalar_lowered.end()); for (auto& op : lowered) { op.first->emit_data(); } OV_ITT_TASK_NEXT(GENERATE, "::GetSnippet") return target->get_snippet(); } + +std::shared_ptr Generator::get_target_machine() const { + return target; +} + +}// namespace snippets +}// namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/broadcastload.cpp b/src/common/snippets/src/op/broadcastload.cpp index 893cae32831c51..04ba89f48775e4 100644 --- a/src/common/snippets/src/op/broadcastload.cpp +++ b/src/common/snippets/src/op/broadcastload.cpp @@ -11,8 +11,7 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, Shape shape) -: BroadcastMove(x, shape), broadcast_info(x.get_shape().size(), 0) { +snippets::op::BroadcastLoad::BroadcastLoad(const Output& x, ov::PartialShape shape) : BroadcastMove(x, std::move(shape)) { constructor_validate_and_infer_types(); } @@ -23,9 +22,7 @@ bool snippets::op::BroadcastLoad::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::BroadcastLoad::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastLoad); check_new_args_count(this, new_args); - auto other = std::make_shared(new_args.at(0), output_shape); - other->set_broadcast_info(this->broadcast_info); - return other; + return 
std::make_shared(new_args.at(0), output_shape); } void snippets::op::BroadcastLoad::validate_and_infer_types() { diff --git a/src/common/snippets/src/op/broadcastmove.cpp b/src/common/snippets/src/op/broadcastmove.cpp index 089cd8f2abd70b..e8c733a44d6345 100644 --- a/src/common/snippets/src/op/broadcastmove.cpp +++ b/src/common/snippets/src/op/broadcastmove.cpp @@ -12,7 +12,7 @@ using namespace std; using namespace ngraph; -snippets::op::BroadcastMove::BroadcastMove(const Output& x, Shape shape) : Op({x}), output_shape(shape) { +snippets::op::BroadcastMove::BroadcastMove(const Output& x, ov::PartialShape shape) : Op({x}), output_shape(std::move(shape)) { constructor_validate_and_infer_types(); } @@ -23,44 +23,9 @@ bool snippets::op::BroadcastMove::visit_attributes(AttributeVisitor& visitor) { std::shared_ptr snippets::op::BroadcastMove::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(BroadcastMove); check_new_args_count(this, new_args); - auto other = std::make_shared(new_args.at(0), this->output_shape); - return other; + return std::make_shared(new_args.at(0), output_shape); } void snippets::op::BroadcastMove::validate_and_infer_types() { set_output_type(0, get_input_element_type(0), this->output_shape); -} - -bool snippets::op::BroadcastMove::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(BroadcastMove); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - auto ishape = 
input_values[0]->get_shape(); - auto oshape = output_values[0]->get_shape(); - - NGRAPH_CHECK(ishape.size() == oshape.size(), "input and output should have the same rank"); - - AxisSet broadcast_axes; - for (size_t k = 0; k < ishape.size(); k++) { - if (!((ishape[k] == oshape[k]) - || (ishape[k] != oshape[k] && ((ishape[k] == 1) != (oshape[k] == 1) ) ))) { - throw ngraph_error("FakeBroadcast::evaluate incompatible shapes"); - } - - if (ishape[k] != oshape[k]) { - broadcast_axes.insert(k); - } - } - - runtime::reference::broadcast(input_values[0]->get_data_ptr(), - output_values[0]->get_data_ptr(), - input_values[0]->get_shape(), - output_values[0]->get_shape(), - broadcast_axes, - sizeof(float)); - return true; } \ No newline at end of file diff --git a/src/common/snippets/src/op/load.cpp b/src/common/snippets/src/op/load.cpp index 1ac4df725fe75d..f7da7c16b1b411 100644 --- a/src/common/snippets/src/op/load.cpp +++ b/src/common/snippets/src/op/load.cpp @@ -8,39 +8,20 @@ #include -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Load::Load(const Output& x, const size_t count) : Op({x}), m_count(count) { +Load::Load(const Output& x, const size_t count) : MemoryAccess({x}, count) { constructor_validate_and_infer_types(); } -bool snippets::op::Load::visit_attributes(AttributeVisitor& visitor) { - return true; -} - -std::shared_ptr snippets::op::Load::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Load::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Load); check_new_args_count(this, new_args); return std::make_shared(new_args.at(0), m_count); } -void snippets::op::Load::validate_and_infer_types() { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); -} - -bool snippets::op::Load::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(Load); - 
NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), "output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - std::copy(input_values[0]->get_data_ptr(), - input_values[0]->get_data_ptr() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(), - output_values[0]->get_data_ptr()); - - return true; -} +}// namespace op +}// namespace snippets +}// namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/loop.cpp b/src/common/snippets/src/op/loop.cpp new file mode 100644 index 00000000000000..c36b713b8f0496 --- /dev/null +++ b/src/common/snippets/src/op/loop.cpp @@ -0,0 +1,169 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "snippets/op/loop.hpp" +#include "snippets/generator.hpp" + +using namespace std; +namespace ngraph { +namespace snippets { +namespace op { + +LoopBase::LoopBase(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment) + : Op(args), dimension(dimension), work_amount(work_amount), increment(increment), evaluate_once(false) { +} + +bool LoopBase::visit_attributes(AttributeVisitor &visitor) { + visitor.on_attribute("dimension", dimension); + visitor.on_attribute("work_amount", work_amount); + visitor.on_attribute("increment", increment); + return true; +} + +size_t LoopBase::get_work_amount() const { + return work_amount; +} + +bool LoopBase::get_evaluate_once() const { + return 
evaluate_once; +} + +size_t LoopBase::get_increment() const { + return increment; +} + +size_t LoopBase::get_dimension() const { + return dimension; +} + +LoopBegin::LoopBegin(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment) + : LoopBase(args, dimension, work_amount, increment), + begin_address(nullptr), input_regs({}) { + // We can only call a reduced validate_and_infer types from the constructor, since LoopEnd might not be attached + // to the LoopBegin at this point (which is usually the case: create LoopBegin first => then attach LoopEnd to it) + validate_and_infer_types_except_LoopEnd(); +} + +LoopBegin::LoopBegin(const std::vector> &args) + : LoopBase(args, 0, 0, 0), begin_address(nullptr), input_regs({}) { + validate_and_infer_types_except_LoopEnd(); +} + +std::shared_ptr LoopBegin::clone_with_new_inputs(const OutputVector& inputs) const { + return std::shared_ptr(new LoopBegin(inputs, dimension, work_amount, increment)); +} + + +void LoopBegin::validate_and_infer_types_except_LoopEnd() { + const size_t num_inputs = get_input_size(); + set_output_size(num_inputs + 1); + // All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd + for (int i = 0; i < num_inputs; i++) + get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr()); + set_output_type(num_inputs, element::f32, ov::PartialShape{ov::Shape{}}); +} + +void LoopBegin::validate_and_infer_types() { + validate_and_infer_types_except_LoopEnd(); + const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs(); + NODE_VALIDATION_CHECK(this, last_output_inputs.size() == 1, "LoopBegin must have exactly one input attached to the last output"); + const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); + NODE_VALIDATION_CHECK(this, loop_end != nullptr, "LoopBegin must have LoopEnd connected to its last output"); + dimension = 
loop_end->get_dimension(); + work_amount = loop_end->get_work_amount(); + increment = loop_end->get_increment(); +} + +std::shared_ptr LoopBegin::get_loop_end() { + const auto& last_output_inputs = output(get_output_size() - 1).get_target_inputs(); + if (last_output_inputs.size() != 1) + throw std::invalid_argument("LoopBegin has more than one inputs attached to the last output"); + const auto& loop_end = ov::as_type_ptr(last_output_inputs.begin()->get_node()->shared_from_this()); + if (!loop_end) + throw std::invalid_argument("LoopBegin last output is not connected to LoopEnd"); + return loop_end; +} + +LoopEnd::LoopEnd(const std::vector> &args, size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment, std::vector finalization_offsets) + : LoopBase(args, dimension, work_amount, increment), apply_increment(std::move(apply_increment)), + finalization_offsets(std::move(finalization_offsets)), has_outer_loop(true) { + constructor_validate_and_infer_types(); +} + +std::shared_ptr LoopEnd::clone_with_new_inputs(const OutputVector& inputs) const { + return std::make_shared(inputs, dimension, work_amount, increment, apply_increment, finalization_offsets); +} + +std::shared_ptr LoopEnd::get_loop_begin() { + const auto& loop_begin = ov::as_type_ptr(get_input_source_output(get_input_size() - 1).get_node_shared_ptr()); + if (!loop_begin) + throw std::invalid_argument("LoopEnd last input is not connected to LoopBegin"); + return loop_begin; +} + +const std::vector& LoopEnd::get_finalization_offsets() const { + return finalization_offsets; +} + +const std::vector& LoopEnd::get_apply_increment() const { + return apply_increment; +} + +void LoopEnd::set_finalization_offsets(std::vector offsets) { + if (offsets.size() != loop_io_size) + throw std::invalid_argument("LoopEnd set_finalization_offsets is called with inconsistent offsets.size()"); + finalization_offsets = std::move(offsets); +} + +void LoopEnd::set_apply_increment(std::vector 
allow_increment) { + if (allow_increment.size() != loop_io_size) + throw std::invalid_argument("LoopEnd set_apply_increment is called with inconsistent apply_increment.size()"); + apply_increment = std::move(allow_increment); +} + +void LoopEnd::set_work_amount(size_t new_work_amount) { + work_amount = new_work_amount; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->work_amount = new_work_amount; +} + +void LoopEnd::set_increment(size_t new_increment) { + increment = new_increment; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->increment = new_increment; +} + +void LoopEnd::set_evaluate_once(bool once) { + evaluate_once = once; + // Update LoopBegin to maintain consistency between the Loops + get_loop_begin()->evaluate_once = once; +} + +void LoopEnd::validate_and_infer_types() { + const size_t num_inputs = get_input_size(); + const auto loop_begin = ov::as_type_ptr(input(get_input_size() - 1).get_source_output().get_node_shared_ptr()); + NODE_VALIDATION_CHECK(this, loop_begin != nullptr, "LoopEnd must have LoopBegin as the last argument"); + // Note: have to -2 because the LoopBegin->LoopEnd edge is counted twice + loop_io_size = get_input_size() + loop_begin->get_output_size() - 2; + NODE_VALIDATION_CHECK(this, apply_increment.empty() || apply_increment.size() == loop_io_size, + "apply_increments must be either empty or defined per every input & output of joined Loop. Expected size: ", + loop_io_size, " got ", apply_increment.size()); + NODE_VALIDATION_CHECK(this, finalization_offsets.empty() || finalization_offsets.size() == loop_io_size, + "finalization_offsets must be either empty or defined per every input & output of joined Loop. 
Expected size: ", + loop_io_size, " got ", finalization_offsets.size()); + if (apply_increment.empty()) + apply_increment.resize(loop_io_size, true); + if (finalization_offsets.empty()) + finalization_offsets.resize(loop_io_size, 0); + set_output_size(num_inputs - 1); + const auto& ins = inputs(); + // All outputs are by-passed from inputs, except for the last one - it connects LoopBegin and LoopEnd + for (int i = 0; i < num_inputs - 1; i++) + get_output_descriptor(i).set_tensor_ptr(get_input_descriptor(i).get_output().get_tensor_ptr()); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/loop_helpers.cpp b/src/common/snippets/src/op/loop_helpers.cpp new file mode 100644 index 00000000000000..5882e305087eac --- /dev/null +++ b/src/common/snippets/src/op/loop_helpers.cpp @@ -0,0 +1,48 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "ngraph/op/op.hpp" +#include "snippets/op/loop_helpers.hpp" + +namespace ngraph { +namespace snippets { +namespace op { +std::shared_ptr insertLoopBeginAfterOutputs(const OutputVector& originalOutputs) { + std::vector>> originalChildInputs; + for (const auto& out : originalOutputs) { + originalChildInputs.push_back(out.get_target_inputs()); + } + + auto loop_begin = std::make_shared(originalOutputs); + + for (int i = 0; i < originalChildInputs.size(); i++) { + for (auto& input : originalChildInputs[i]) { + input.replace_source_output(loop_begin->output(i)); + } + } + return loop_begin; +} + +std::shared_ptr insertLoopEndBeforeInputs(const std::vector>& originalInputs, + const std::shared_ptr& loopBegin, + size_t dimension, size_t work_amount, size_t increment, + std::vector apply_increment, + std::vector finalization_offsets) { + OutputVector originalParentOutputs; + for (const auto& in : originalInputs) { + originalParentOutputs.push_back(in.get_source_output()); + } + 
originalParentOutputs.push_back(loopBegin->output(loopBegin->get_output_size() - 1)); + auto loop_end = std::make_shared(originalParentOutputs, dimension, work_amount, increment, + std::move(apply_increment), std::move(finalization_offsets)); + + for (int i = 0; i < originalInputs.size(); i++) { + originalInputs[i].replace_source_output(loop_end->output(i)); + } + return loop_end; +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/memory_access.cpp b/src/common/snippets/src/op/memory_access.cpp new file mode 100644 index 00000000000000..79f6b63a4be691 --- /dev/null +++ b/src/common/snippets/src/op/memory_access.cpp @@ -0,0 +1,37 @@ +// Copyright (C) 2018-2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "snippets/op/memory_access.hpp" + +#include + +namespace ngraph { +namespace snippets { +namespace op { + +MemoryAccess::MemoryAccess(const Output& x, const size_t count) : Op({x}), m_count(count) { +} + +bool MemoryAccess::visit_attributes(AttributeVisitor& visitor) { + visitor.on_attribute("count", m_count); + return true; +} + +size_t MemoryAccess::get_count() const { + return m_count; +} + +void MemoryAccess::set_count(const size_t count) { + m_count = count; +} + +void MemoryAccess::validate_and_infer_types() { + set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); +} + +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/scalar.cpp b/src/common/snippets/src/op/scalar.cpp index c788c341a3e02f..7a369ee1d163c2 100644 --- a/src/common/snippets/src/op/scalar.cpp +++ b/src/common/snippets/src/op/scalar.cpp @@ -19,4 +19,14 @@ void snippets::op::Scalar::validate_and_infer_types() { NODE_VALIDATION_CHECK(this, out_pshape.get_shape().empty() || ov::shape_size(out_pshape.get_shape()) == 1, "Scalar supports only one-element constants, got ", 
out_pshape.get_shape(), " shape"); +} + +bool snippets::op::Scalar::visit_attributes(AttributeVisitor& visitor) { + auto shape = get_output_shape(0); + auto type = get_output_element_type(0); + auto value = cast_vector(); + visitor.on_attribute("element_type", type); + visitor.on_attribute("shape", shape); + visitor.on_attribute("value", value); + return true; } \ No newline at end of file diff --git a/src/common/snippets/src/op/store.cpp b/src/common/snippets/src/op/store.cpp index db3204df69ab0b..69e1e1643b769b 100644 --- a/src/common/snippets/src/op/store.cpp +++ b/src/common/snippets/src/op/store.cpp @@ -8,39 +8,20 @@ #include -using namespace std; -using namespace ngraph; +namespace ngraph { +namespace snippets { +namespace op { -snippets::op::Store::Store(const Output& x, const size_t count) : Op({x}), m_count(count) { +Store::Store(const Output& x, const size_t count) : MemoryAccess({x}, count) { constructor_validate_and_infer_types(); } -bool snippets::op::Store::visit_attributes(AttributeVisitor& visitor) { - return true; -} - -std::shared_ptr snippets::op::Store::clone_with_new_inputs(const OutputVector& new_args) const { +std::shared_ptr Store::clone_with_new_inputs(const OutputVector& new_args) const { INTERNAL_OP_SCOPE(Store); check_new_args_count(this, new_args); return std::make_shared(new_args.at(0), m_count); } -void snippets::op::Store::validate_and_infer_types() { - set_output_type(0, get_input_element_type(0), get_input_partial_shape(0)); -} - -bool snippets::op::Store::evaluate(const HostTensorVector& output_values, const HostTensorVector& input_values) const { - INTERNAL_OP_SCOPE(Store); - NGRAPH_CHECK(input_values.size() == this->inputs().size(), "wrong input config"); - NGRAPH_CHECK(output_values.size() == this->outputs().size(), "wrong output config"); - NGRAPH_CHECK(input_values.size() == output_values.size() && input_values.size() == 1, "must be 1->1 operation"); - NGRAPH_CHECK(this->output(0).get_shape() == output_values[0]->get_shape(), 
"output vector must have the same shape as output port"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - NGRAPH_CHECK(this->input(0).get_shape() == input_values[0]->get_shape(), "input and output must have same shape"); - - std::copy(input_values[0]->get_data_ptr(), - input_values[0]->get_data_ptr() + shape_size(get_output_shape(0))*output_values[0]->get_element_type().size(), - output_values[0]->get_data_ptr()); - - return true; -} +} // namespace op +} // namespace snippets +} // namespace ngraph \ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 72573f5519a089..8cdd858a90a7a1 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -14,6 +14,7 @@ #include "snippets/pass/convert_constants.hpp" #include "snippets/pass/convert_power_to_powerstatic.hpp" #include "snippets/pass/vector_to_scalar.hpp" +#include "snippets/pass/insert_loops.hpp" #include "snippets/pass/transform_convert.hpp" #include "snippets/pass/align_element_type.hpp" #include "snippets/utils.hpp" @@ -62,6 +63,36 @@ std::shared_ptr snippets::op::Subgraph::clone_with_new_inputs(const Output return make_shared(inputs, ov::clone_model(*m_body.get())); } +std::vector snippets::op::Subgraph::reshape_body(const std::vector& input_shapes) { + auto& params = m_body->get_parameters(); + OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body"); + for (size_t i = 0; i < params.size(); ++i) { + params[i]->set_partial_shape(input_shapes[i]); + } + m_body->validate_nodes_and_infer_types(); + std::vector output_shapes; + for (const auto& res : m_body->get_results()) { + output_shapes.emplace_back(res->get_input_partial_shape(0)); + } + return output_shapes; +} + +std::vector snippets::op::Subgraph::reshape_body(const std::vector& input_shapes) { + auto& params = 
m_body->get_parameters(); + OPENVINO_ASSERT(params.size() == input_shapes.size(), "Got invalid number of input shapes to reshape subgraph body"); + for (size_t i = 0; i < params.size(); ++i) { + params[i]->set_partial_shape(input_shapes[i]); + } + m_body->validate_nodes_and_infer_types(); + std::vector output_shapes; + for (const auto& res : m_body->get_results()) { + auto pshape = res->get_input_partial_shape(0); + OPENVINO_ASSERT(pshape.is_static(), "Subgraph inferred dynamic output shape during reshape with static inputs"); + output_shapes.emplace_back(res->get_input_partial_shape(0).get_shape()); + } + return output_shapes; +} + void snippets::op::Subgraph::validate_and_infer_types() { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::validate_and_infer_types") @@ -169,7 +200,8 @@ void snippets::op::Subgraph::fill_empty_output_names(const Output& target_ /// * None: all inputs have the same layout /// * Planar + blocked: some inputs have blocked, and some have planar layouts, e.g. 
+ /// Also there is precision aligning inside body of subgraph during canonicalization -Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, const BlockedShapeVector& inputShapes) { +ov::PartialShape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShapes, + const BlockedShapeVector& inputShapes) { INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::canonicalize") NODE_VALIDATION_CHECK(this, inputShapes.size() == m_body->get_parameters().size(), @@ -184,31 +216,30 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape return std::get<0>(lhs).size() < std::get<0>(rhs).size(); }); }; - Shape baseShape; + PartialShape baseShape; AxisVector baseOrder; std::tie(baseShape, baseOrder, std::ignore) = getMaxRankBlockedShape(inputShapes); const auto baseRank = baseShape.size(); const bool baseIsBlocked = baseOrder.size() != std::set(baseOrder.begin(), baseOrder.end()).size(); for (size_t i = 0; i < inputShapes.size(); i++) { const auto &blockedShape = inputShapes[i]; - Shape inShape; + PartialShape inShape; AxisVector inOrder; element::Type inType; std::tie(inShape, inOrder, inType) = blockedShape; const auto inRank = inShape.size(); NODE_VALIDATION_CHECK(this, inRank <= baseRank, "Input rank can't be larger than output rank in snippets."); if (inRank < baseRank) { - Shape newShape(baseRank, 1); + PartialShape newShape(ov::Shape(baseRank, 1)); // todo: more complicated logics is needed if we want to merge smth else than blocked and planar - // could be done by PartialShape::broadcast_merge_into, but this way is faster - size_t startOffset = baseRank - inRank; if (baseIsBlocked) { const bool inIsNotBlocked = inOrder.size() == std::set(inOrder.begin(), inOrder.end()).size(); NODE_VALIDATION_CHECK(this, inIsNotBlocked, "Snippets don't support conversion between blocked layouts of different ranks"); - startOffset--; + inShape.insert(inShape.end(), 
ov::Dimension(1)); } - std::copy(inShape.begin(), inShape.end(), &newShape[startOffset]); - inShape = move(newShape); + NODE_VALIDATION_CHECK(this, PartialShape::broadcast_merge_into(newShape, inShape, ov::op::AutoBroadcastType::NUMPY), + "Failed to broadcast_merge inputs in snippets canonicalization"); + inShape = std::move(newShape); } else { // todo: 4d blocked + 5d planar layouts are not supported: + NODE_VALIDATION_CHECK(this, @@ -219,30 +250,30 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape NODE_VALIDATION_CHECK(this, PartialShape::broadcast_merge_into(tmpPShape, inShape, ::ngraph::op::AutoBroadcastType::NUMPY), "Failed to create broadcastable shapes in snippets canonicalization"); - const auto paramShape = m_body->get_parameters()[i]->get_shape(); + const auto paramShape = m_body->get_parameters()[i]->get_partial_shape(); const auto paramType = m_body->get_parameters()[i]->get_element_type(); if (paramShape.size() != inShape.size() || !equal(paramShape.begin(), paramShape.end(), inShape.begin())) m_body->replace_parameter(i, std::make_shared(paramType, inShape)); } - m_body->validate_nodes_and_infer_types(); - auto skipStartEndOnes = [](const Shape& shape) { + auto skipStartEndOnes = [](const PartialShape& shape) { auto begin = shape.begin(); auto end = shape.end(); while (begin != end && *begin == 1) begin++; while (begin != end && *(end-1) == 1) end--; - Shape trimmedShape(end - begin, 1); + + PartialShape trimmedShape(std::vector (end - begin, 1)); std::copy(begin, end, trimmedShape.begin()); return trimmedShape; }; // Check that output shapes are broadcastable => can be scheduled const auto& body_results = m_body->get_results(); - PartialShape outPShape = body_results[0]->get_shape(); + PartialShape outPShape = body_results[0]->get_input_partial_shape(0); for (size_t i = 0; i < body_results.size(); i++) { - auto shape_i = body_results[i]->get_shape(); + auto shape_i = body_results[i]->get_input_partial_shape(0); auto 
outputShape_i = std::get<0>(outputShapes[i]); // Check that the produced output shape corresponds to the passed shape // Some produced shapes may have been changed to be broadcastable (e.g. blocked + planar outputs), @@ -250,9 +281,7 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape PartialShape pShape_i(skipStartEndOnes(shape_i)); bool compatibleWithPassedShape = PartialShape::broadcast_merge_into(pShape_i, skipStartEndOnes(outputShape_i), ::ngraph::op::AutoBroadcastType::NUMPY); - NODE_VALIDATION_CHECK(this, ov::shape_size(shape_i) == ov::shape_size(outputShape_i) && - compatibleWithPassedShape, "Inferred and passed results shapes are incompatible for snippet ", - get_friendly_name(), " : ", shape_i, " vs ", outputShape_i, "."); + NODE_VALIDATION_CHECK(this, compatibleWithPassedShape, "Inferred and passed results shapes are incompatible for snippet "); // Check that output shapes are broadcastable to each other => can be scheduled bool compatibleWithOtherOutputs = PartialShape::broadcast_merge_into(outPShape, shape_i, ::ngraph::op::AutoBroadcastType::NUMPY); @@ -263,8 +292,18 @@ Shape snippets::op::Subgraph::canonicalize(const BlockedShapeVector& outputShape // to align precision inside Subgraph body that is supported by Plugin align_element_types(outputShapes, inputShapes); - exec_domain = outPShape.get_shape(); - return exec_domain; + master_shape = outPShape; + return master_shape; +} + +PartialShape snippets::op::Subgraph::get_master_shape() { + auto results = m_body->get_results(); + PartialShape outPShape = results[0]->get_input_partial_shape(0); + for (const auto& r : results) + PartialShape::broadcast_merge_into(outPShape, r->get_input_shape(0), + ::ngraph::op::AutoBroadcastType::NUMPY); + master_shape = outPShape; + return master_shape; } void snippets::op::Subgraph::align_element_types(const BlockedShapeVector& outputShapes, @@ -307,42 +346,60 @@ void snippets::op::Subgraph::convert_to_snippet_dialect() { 
INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::convert_to_snippet_dialect") auto skip_matching_domain = [](const std::shared_ptr& n) -> bool { - return n->get_input_shape(0).back() != 1; + const auto& pshape = n->get_input_partial_shape(0); + const auto& last_dim = pshape[pshape.size() - 1]; + return last_dim.is_dynamic() || last_dim.get_length() != 1; }; // At the moment we support only full vector Load/Store and scalar Load/Store so that count is equal to lanes. // Then we are going to support variadic Load/Store with different element count const size_t count = m_generator->get_target_machine()->get_lanes(); + const auto & params = m_body->get_parameters(); + bool inputs_has_dynamic_last_dims = std::any_of(params.begin(), params.end(), + [](const shared_ptr& p){ + return p->get_partial_shape().rbegin()->is_dynamic(); + }); ngraph::pass::Manager manager; manager.register_pass(); manager.register_pass(); manager.register_pass(count); manager.register_pass(count); - manager.register_pass(); - manager.register_pass(); - // Note that, BrodacastMove is typically inserted right after the Load. Such cases are typical for - // simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove - // could also be inserted after the ngraph::op, if the op input don't need broadcasting, but the the output does - // (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced - // with ScalarLoads (ScalarStores) to avoid invalid read in vector Tile. Graph example: - // Parameter_0 Parameter_1 Parameter_2 - // [1,2,5,16] [1,2,5,1] [1,2,5,1] - // Load BroadcastLoad Load* Scalar - // Add Subtract - // \___________ ___________BroadcastMove - // \ / - // Multiply - // Store - // Result - // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Tile. 
- if (!exec_domain.empty() && exec_domain.back() != 1) { - manager.register_pass(); - manager.register_pass(); - manager.get_pass_config()-> - set_callback(skip_matching_domain); - manager.get_pass_config()-> - set_callback(skip_matching_domain); + // todo: presently dynamic pipeline is activated even if the last two dimensions are static + // In general, we can use static kernels in this case, but several parameters (src and dst memory pointers for example) + // should be passed as run-time args, so it's a mixed regime: kernel is shape-aware, but some additional runtime args are required + // Presently Broadcasting is organized in the following way: + // * ALL last dims are static => broadcasting is handled via MoveBroadcast and pointer arithmetic (even for dynamic upper dims) + if (!inputs_has_dynamic_last_dims) { + manager.register_pass(); + manager.register_pass(); + // Note that, BroadcastMove is typically inserted right after the Load. Such cases are typical for + // simple subgraphs where one of the ngraph::op's inputs is broadcasted to match the larger one. However, BroadcastMove + // could also be inserted after the ngraph::op, if the op input doesn't need broadcasting, but the output does + // (for example, to match the larger output of a child node). In such cases, Loads (and Stores) should be replaced + // with ScalarLoads (ScalarStores) to avoid invalid read in vector Loop. Graph example: + // Parameter_0 Parameter_1 Parameter_2 + // [1,2,5,16] [1,2,5,1] [1,2,5,1] + // Load BroadcastLoad Load* Scalar + // Add Subtract + // \___________ ___________BroadcastMove + // \ / + // Multiply + // Store + // Result + // Note: Load* should be replaced with ScalarLoad in this example to avoid invalid read in vector Loop. 
+ if (master_shape.size() != 0 && master_shape[master_shape.size() - 1] != 1) { + manager.register_pass(); + manager.register_pass(); + manager.get_pass_config()-> + set_callback(skip_matching_domain); + manager.get_pass_config()-> + set_callback(skip_matching_domain); + } + // todo: get_lanes() assumes fp32. Could there be any int8 issues? + // Note that InsertLoops requires validate_and_infer_types afterwards, so add it manually if + // automatic validation will be disabled in the pass manager + manager.register_pass(master_shape, m_generator->get_target_machine()->get_lanes()); } manager.run_passes(m_body); } @@ -371,14 +428,14 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, INTERNAL_OP_SCOPE(Subgraph); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::generate") NGRAPH_CHECK(m_generator != nullptr, "generate is called while generator is not set"); + + convert_to_snippet_dialect(); opt.run_passes(m_body); - // generation flow snippets::pass::AssignRegisters().run_on_model(m_body); // schedule generation should go here and be target agnostic - // actual code emission ngraph::snippets::code ptr = m_generator->generate(m_body, compile_params); @@ -393,7 +450,7 @@ snippets::Schedule snippets::op::Subgraph::generate(ngraph::pass::Manager& opt, } NGRAPH_CHECK(!constants.size(), "External constants detected. 
Snippet is illigal for scheduling"); - return {exec_domain, false /*canBeLinearized*/, ptr}; + return {master_shape, false /*canBeLinearized*/, ptr}; } void snippets::op::Subgraph::print() const { diff --git a/src/common/snippets/src/op/tile.cpp b/src/common/snippets/src/op/tile.cpp deleted file mode 100644 index b37e212fdcf88d..00000000000000 --- a/src/common/snippets/src/op/tile.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/tile.hpp" -#include "snippets/generator.hpp" - -using namespace std; -using namespace ngraph; - -snippets::op::Tile::Tile(const std::vector& region, size_t increment, - size_t num_inputs, size_t num_outputs, - const std::vector& io_dims, const std::vector& io_data_sizes) : - Op(), region(region), increment(increment), num_inputs(num_inputs), num_outputs(num_outputs), io_dims(io_dims), io_data_size(io_data_sizes) { -} diff --git a/src/common/snippets/src/op/tile_scheduler.cpp b/src/common/snippets/src/op/tile_scheduler.cpp deleted file mode 100644 index fd0ba9e6a23223..00000000000000 --- a/src/common/snippets/src/op/tile_scheduler.cpp +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright (C) 2018-2022 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "snippets/op/tile_scheduler.hpp" -#include "snippets/generator.hpp" - -ngraph::snippets::op::TileScheduler::TileScheduler(const AllocatedEmitter& vector_region, const AllocatedEmitter& scalar_region) - : Op(), vector_region{vector_region}, scalar_region{scalar_region} { -} diff --git a/src/common/snippets/src/pass/assign_registers.cpp b/src/common/snippets/src/pass/assign_registers.cpp index a0429fcfc9996f..7478ed39263ff1 100644 --- a/src/common/snippets/src/pass/assign_registers.cpp +++ b/src/common/snippets/src/pass/assign_registers.cpp @@ -2,81 +2,166 @@ // SPDX-License-Identifier: Apache-2.0 // -// #include #include -#include "snippets/remarks.hpp" - #include 
"snippets/pass/assign_registers.hpp" #include "snippets/snippets_isa.hpp" -#include - -#include - bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr& f) { RUN_ON_MODEL_SCOPE(AssignRegisters); OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::AssignRegisters") using Reg = size_t; + using tensor = std::shared_ptr; auto ops = f->get_ordered_ops(); - decltype(ops) stmts; - std::copy_if(ops.begin(), ops.end(), std::back_inserter(stmts), [](decltype(ops[0]) op) { - return !(std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op)); - }); + // Note that currently there are 3 types of ops: + // * gpr->gpr: (Parameter, Result, LoopBegin, LoopEnd) will also be Buffer? + // * gpr->vec: or vec->gpr Load/LoadConvert, Store/StoreConvert, BroadcastLoad etc. + // * vec->vec: all other "normal" operations that perform calculations on vector registers: Add, BroadcastMove, Power, etc. + enum op_reg_type {gpr2gpr, gpr2vec, vec2gpr, vec2vec}; - size_t rdx = 0; - std::map, Reg> regs; - for (const auto& op : stmts) { - for (const auto& output : op->outputs()) { - regs[output.get_tensor_ptr()] = rdx++; + auto get_op_reg_type = [](const std::shared_ptr& op) { + if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) + return gpr2gpr; + else if (std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op)) + return gpr2vec; + else if (std::dynamic_pointer_cast(op)) + return vec2gpr; + else + return vec2vec; + }; + std::vector>> typed_ops; + for (const auto& op : ops) + typed_ops.emplace_back(std::make_pair(get_op_reg_type(op), op)); + size_t counter_vec = 0; + size_t counter_gpr = 0; + std::map regs_vec, regs_gpr; + // Define a set of immune tensors that will be ignored by auto reg allocation => their reg allocation is done manually + // todo: presently it hold only gpr's. 
If you need to manually assign vec's, implement reg_type or create a second map + std::map manually_assigned_regs; + const auto IS_MANUALLY_ALLOCATED_REG = SIZE_MAX; + const auto num_parameters = f->get_parameters().size(); + for (const auto& op : ops) { + if (const auto& param = ov::as_type_ptr(op)) { + manually_assigned_regs[op->output(0).get_tensor_ptr()] = + static_cast(f->get_parameter_index(param)); + } else if (const auto& result = ov::as_type_ptr(op)) { + // here we use the fact that Result input & output tensors are identical by construction + manually_assigned_regs[op->output(0).get_tensor_ptr()] = + static_cast(f->get_result_index(result) + num_parameters); } } - - std::vector> used; - std::vector> def; - - for (const auto& op : stmts) { - std::set u; - for (const auto& input : op->inputs()) { - if (regs.count(input.get_tensor_ptr())) { - u.insert(regs[input.get_tensor_ptr()]); + auto enumerate_out_tensors = [IS_MANUALLY_ALLOCATED_REG, &manually_assigned_regs] (const std::shared_ptr& op, + decltype(regs_vec)& reg_map, + size_t& counter) { + for (const auto& output : op->outputs()) { + const auto& t = output.get_tensor_ptr(); + // Note that some ops might have identical input&output tensors (Result and Tile* for ex.) + // so we have to check that the tensor has not been enumerated already + if (reg_map.count(t) == 0) { + reg_map[t] = manually_assigned_regs.count(t) == 0 ? 
counter++ : IS_MANUALLY_ALLOCATED_REG; } } - used.push_back(u); + }; + for (const auto& t_op : typed_ops) { + switch (t_op.first) { + case vec2vec: + case gpr2vec: + enumerate_out_tensors(t_op.second, regs_vec, counter_vec); + break; + case gpr2gpr: + case vec2gpr: + enumerate_out_tensors(t_op.second, regs_gpr, counter_gpr); + break; + } + } + // todo: make one for gpr and one for vector + std::vector> used_gpr(ops.size(), std::set()); // used = used as an input + std::vector> defined_gpr(ops.size(), std::set()); // defined = used as output + std::vector> used_vec(ops.size(), std::set()); + std::vector> defined_vec(ops.size(), std::set()); - std::set d; - if (!std::dynamic_pointer_cast(op)) { - for (const auto& output : op->outputs()) { - d.insert(regs[output.get_tensor_ptr()]); - } + auto tensor2reg = [IS_MANUALLY_ALLOCATED_REG] (const std::vector& tensors, const std::map& reg_map) { + std::set result; + for (const auto& t : tensors) { + if (reg_map.count(t) == 0) + throw ngraph::ngraph_error("Assign registers: attempt to access not enumerated tensor"); + Reg reg_id = reg_map.at(t); + if (reg_id != IS_MANUALLY_ALLOCATED_REG) + result.insert(reg_id); + } + return result; + }; + for (int i = 0; i < typed_ops.size(); i++) { + const auto& t_op = typed_ops[i]; + std::vector used_tensors, defined_tensors; + for (const auto& in : t_op.second->inputs()) + used_tensors.push_back(in.get_tensor_ptr()); + for (const auto& out : t_op.second->outputs()) + defined_tensors.push_back(out.get_tensor_ptr()); + switch (t_op.first) { + case vec2vec: + used_vec[i] = tensor2reg(used_tensors, regs_vec); + defined_vec[i] = tensor2reg(defined_tensors, regs_vec); + break; + case gpr2gpr: + used_gpr[i] = tensor2reg(used_tensors, regs_gpr); + defined_gpr[i] = tensor2reg(defined_tensors, regs_gpr); + break; + case gpr2vec: + used_gpr[i] = tensor2reg(used_tensors, regs_gpr); + defined_vec[i] = tensor2reg(defined_tensors, regs_vec); + break; + case vec2gpr: + used_vec[i] = tensor2reg(used_tensors, 
regs_vec); + defined_gpr[i] = tensor2reg(defined_tensors, regs_gpr); + break; } - def.push_back(d); } // define life intervals - std::vector> lifeIn(stmts.size(), std::set()); - std::vector> lifeOut(stmts.size(), std::set()); + // liveOut[i] - regs that are live on exit from i-th (topologically ordered) operation + // liveIn[i] - regs that are live on entering the i-th (topologically ordered) operation + std::vector> life_in_vec(std::move(used_vec)); + std::vector> life_out_vec(typed_ops.size(), std::set()); + std::vector> life_in_gpr(std::move(used_gpr)); + std::vector> life_out_gpr(typed_ops.size(), std::set()); - for (size_t i = 0; i < stmts.size(); i++) { - for (size_t n = 0; n < stmts.size(); n++) { - std::set_difference(lifeOut[n].begin(), lifeOut[n].end(), def[n].begin(), def[n].end(), std::inserter(lifeIn[n], lifeIn[n].begin())); - lifeIn[n].insert(used[n].begin(), used[n].end()); + // todo: this part is O(N*N), so it's slow for large subgraphs. Can we simplify it? At least add an early stopping criteria + for (size_t i = 0; i < typed_ops.size(); i++) { + for (size_t n = 0; n < typed_ops.size(); n++) { + // Regs that are live on entering the operation = regs used by the op + (all other regs alive - regs defined by the op) + // copy regs from lifeOut to lifeIn while ignoring regs in def + std::set_difference(life_out_gpr[n].begin(), life_out_gpr[n].end(), + defined_gpr[n].begin(), defined_gpr[n].end(), + std::inserter(life_in_gpr[n], life_in_gpr[n].begin())); + std::set_difference(life_out_vec[n].begin(), life_out_vec[n].end(), + defined_vec[n].begin(), defined_vec[n].end(), + std::inserter(life_in_vec[n], life_in_vec[n].begin())); } - for (size_t n = 0; n < stmts.size(); n++) { - auto node = stmts[n]; - if (!std::dynamic_pointer_cast(node)) { - for (const auto& out : node->outputs()) { - for (const auto& port : out.get_target_inputs()) { - auto pos = std::find(stmts.begin(), stmts.end(), port.get_node()->shared_from_this()); - if (pos != stmts.end()) { - 
auto k = pos-stmts.begin(); - lifeOut[n].insert(lifeIn[k].begin(), lifeIn[k].end()); - } + for (size_t n = 0; n < typed_ops.size(); n++) { + auto op = typed_ops[n].second; + for (const auto& out : op->outputs()) { + for (const auto& port : out.get_target_inputs()) { + auto k = std::find(ops.begin(), ops.end(), port.get_node()->shared_from_this()) - ops.begin(); + if (k == ops.size()) + throw ngraph_error("assign registers can't find target op in the body"); + switch (typed_ops[k].first) { + case vec2vec: + case vec2gpr: + life_out_vec[n].insert(life_in_vec[k].begin(), life_in_vec[k].end()); + break; + case gpr2gpr: + case gpr2vec: + life_out_gpr[n].insert(life_in_gpr[k].begin(), life_in_gpr[k].end()); + break; } } } } } - struct by_starting { auto operator()(const std::pair& lhs, const std::pair& rhs) const -> bool { return lhs.first < rhs.first|| (lhs.first == rhs.first && lhs.second < rhs.second); @@ -88,13 +173,15 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr return lhs.second < rhs.second || (lhs.second == rhs.second && lhs.first < rhs.first); } }; + // A variable live interval - is a range (start, stop) of op indexes, such that + // the variable is alive within this range (defined but not used by the last user) + std::map, Reg, by_starting> live_intervals_vec, live_intervals_gpr; - std::set, by_starting> live_intervals; - - std::reverse(lifeIn.begin(), lifeIn.end()); - auto find_last_use = [lifeIn](int i) -> int { - int ln = static_cast(lifeIn.size()) - 1; - for (auto& x : lifeIn) { + std::reverse(life_in_vec.begin(), life_in_vec.end()); + std::reverse(life_in_gpr.begin(), life_in_gpr.end()); + auto find_last_use = [](decltype(life_in_gpr) life_in, int i) -> int { + int ln = static_cast(life_in.size()) - 1; + for (auto& x : life_in) { if (x.find(i) != x.end()) { return ln; } @@ -102,67 +189,83 @@ bool ngraph::snippets::pass::AssignRegisters::run_on_model(const std::shared_ptr } return i; }; - - for (size_t i = 0; i < 
stmts.size(); i++) { - live_intervals.insert(std::make_pair(static_cast(i), find_last_use(static_cast(i)))); + for (int i = 0; i < static_cast(typed_ops.size()); i++) { + for (const auto& def : defined_vec[i]) + live_intervals_vec[std::make_pair(i, find_last_use(life_in_vec, static_cast(def)))] = def; + for (const auto& def : defined_gpr[i]) + live_intervals_gpr[std::make_pair(i, find_last_use(life_in_gpr, static_cast(def)))] = def; } - // http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf - std::multiset, by_ending> active; - std::map register_map; - std::stack bank; - for (int i = 0; i < 16; i++) bank.push(16-1-i); + auto linescan_assign_registers = [](const decltype(live_intervals_vec)& live_intervals, + const std::set& reg_pool) { + // http://web.cs.ucla.edu/~palsberg/course/cs132/linearscan.pdf + // todo: do we need multimap? <=> can an op have two inputs from the same op? + std::map, Reg, by_ending> active; + // uniquely defined register => reused reg (reduced subset enabled by reg by reusage) + std::map register_map; + std::stack bank; + // regs are stored in ascending order in reg_pool, so walk in reverse to assign them the same way + for (auto rit = reg_pool.crbegin(); rit != reg_pool.crend(); rit++) + bank.push(*rit); - for (auto interval : live_intervals) { - // check expired - while (!active.empty()) { - auto x = *active.begin(); - if (x.second >= interval.first) { - break; + std::pair interval, active_interval; + Reg unique_reg, active_unique_reg; + for (const auto& interval_reg : live_intervals) { + std::tie(interval, unique_reg) = interval_reg; + // check expired + while (!active.empty()) { + std::tie(active_interval, active_unique_reg) = *active.begin(); + // if end of active interval has not passed yet => stop removing actives since they are sorted by end + if (active_interval.second >= interval.first) { + break; + } + active.erase(active_interval); + bank.push(register_map[active_unique_reg]); + } + // allocate + if (active.size() == 
reg_pool.size()) { + // todo: if it is LoopBegin or LoopEnd that requires gpr, and we don't have any in the pool, + // then assign SIZE_MAX-1 as a flag to spill a reg inside emitter + throw ngraph::ngraph_error("can't allocate registers for a snippet "); + } else { + register_map[unique_reg] = bank.top(); + bank.pop(); + active.insert(interval_reg); } - active.erase(x); - bank.push(register_map[x.first]); - } - // allocate - if (active.size() == 16) { - throw ngraph_error("caanot allocate registers for a snippet "); - } else { - register_map[interval.first] = bank.top(); - bank.pop(); - active.insert(interval); } - } + return register_map; + }; + // todo: vec_/gpr_pool are hardware-specific and should be provided by a backend, e.g. overloaded generator + std::set vec_pool; + for (Reg i = 0; i < 16; i++) + vec_pool.insert(i); + auto unique2reused_map_vec = linescan_assign_registers(live_intervals_vec, vec_pool); + std::set gpr_pool(std::move(vec_pool)); + for (const auto& t_reg : manually_assigned_regs) + gpr_pool.erase(t_reg.second); + auto unique2reused_map_gpr = linescan_assign_registers(live_intervals_gpr, gpr_pool); - std::map, Reg> physical_regs; + std::map assigned_regs(std::move(manually_assigned_regs)); + auto register_assigned_regs = [IS_MANUALLY_ALLOCATED_REG, &assigned_regs](const std::map& unique_regs, + const std::map& unique2reused) { + for (const auto& reg : unique_regs) { + if (reg.second == IS_MANUALLY_ALLOCATED_REG) + continue; + if (unique2reused.count(reg.second) == 0) + throw ngraph::ngraph_error("Assign registers failed to allocate register for a tensor"); + assigned_regs[reg.first] = unique2reused.at(reg.second); + } + }; + register_assigned_regs(regs_vec, unique2reused_map_vec); + register_assigned_regs(regs_gpr, unique2reused_map_gpr); - for (const auto& reg : regs) { - physical_regs[reg.first] = register_map[reg.second]; - } - const auto num_parameters = f->get_parameters().size(); - for (const auto& n : f->get_ordered_ops()) { - auto& rt = 
n->get_rt_info(); - std::vector regs; - regs.reserve(n->outputs().size()); - /* The main idea here is that each operation stores its output regs in rt["reginfo"]. Input and output regs are - * then derived by parsing node's and parent's rt["reginfo"], look into ngraph::snippets::getRegisters for details. - * Note also that Parameter and Result store general-purpose register index, because they work with memory - * (memory pointer is stored in gpr). All other "regular" ops store vector regs indexes, since calculations are - * performed on registers. - */ - if (is_type(n)) { - continue; - } else if (const auto& param = ov::as_type_ptr(n)) { - regs.push_back(f->get_parameter_index(param)); - } else if (const auto& store = ov::as_type_ptr(n)) { - regs.push_back(f->get_result_index(store) + num_parameters); - } else { - for (const auto& output : n->outputs()) { - auto allocated = physical_regs[output.get_tensor_ptr()]; - regs.push_back(allocated); - } + for (const auto& t_op : typed_ops) { + for (const auto& out : t_op.second->outputs()) { + const auto& t = out.get_tensor_ptr(); + auto& rt = t->get_rt_info(); + rt["reginfo"] = static_cast(assigned_regs[t]); } - rt["reginfo"] = regs; } - return false; } + diff --git a/src/common/snippets/src/pass/collapse_subgraph.cpp b/src/common/snippets/src/pass/collapse_subgraph.cpp index 57c737f992b89e..592601f681c682 100644 --- a/src/common/snippets/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/src/pass/collapse_subgraph.cpp @@ -14,6 +14,7 @@ #include #include #include "transformations/utils/utils.hpp" +#include "ngraph/op/util/attr_types.hpp" #include #include @@ -32,29 +33,15 @@ namespace pass { namespace { auto outputs_are_not_broadcastable(const std::shared_ptr& node) -> bool { - auto outputs = node->outputs(); - auto find_smallest_output_shape = [](const std::vector>& outputs) -> Shape { - return std::accumulate(std::begin(outputs), std::end(outputs), ngraph::Shape(outputs.begin()->get_shape()), - [](Shape& 
other_shape, const Output& output){ - return shape_size(output.get_shape()) < shape_size(other_shape) ? output.get_shape() : other_shape; - }); - }; - auto ref_shape = find_smallest_output_shape(outputs); - - auto check_shapes_broadcastable = [ref_shape](const Output& output) -> bool { - auto other_shape = output.get_shape(); - - if (other_shape.size() != ref_shape.size()) { - return false; - } - - return std::inner_product(std::begin(other_shape), std::end(other_shape), std::begin(ref_shape), true, - std::logical_and(), [](Shape::value_type lsh, Shape::value_type rsh){ - return rsh == 1 || lsh == rsh; - }); - }; - - return std::find_if_not(std::begin(outputs), std::end(outputs), check_shapes_broadcastable) != std::end(outputs); + const auto& outputs = node->outputs(); + if (outputs.size() <= 1) + return false; + ov::PartialShape ref_shape = outputs.front().get_partial_shape(); + bool success = true; + for (int i = 1; i < outputs.size() && success; i++) { + success &= ov::PartialShape::broadcast_merge_into(ref_shape, outputs[i].get_partial_shape(), ov::op::AutoBroadcastType::NUMPY); + } + return !success; } auto is_supported_op(const std::shared_ptr &n) -> bool { diff --git a/src/common/snippets/src/pass/convert_constants.cpp b/src/common/snippets/src/pass/convert_constants.cpp index 3cb791d0130163..3c2e8cee2a7a6f 100644 --- a/src/common/snippets/src/pass/convert_constants.cpp +++ b/src/common/snippets/src/pass/convert_constants.cpp @@ -20,11 +20,14 @@ ngraph::snippets::pass::ConvertConstantsToScalars::ConvertConstantsToScalars() { ngraph::graph_rewrite_callback callback = [this](ngraph::pattern::Matcher &m) { OV_ITT_SCOPED_TASK(ngraph::pass::itt::domains::SnippetsTransform, "Snippets::op::ConvertConstantsToScalars") auto constant = as_type_ptr(m.get_match_root()); - auto scalar = std::make_shared(*constant); + if (ov::shape_size(constant->get_output_shape(0)) != 1) + return false; + // Note that all Constants {1,1,1,1} are converted to Scalar {1} here + // This is 
needed to simplify shape inference, otherwise {1,1,1,1} Constants can increase output rank + auto scalar = std::make_shared(ov::op::v0::Constant(*constant, ov::Shape{1})); scalar->set_friendly_name(constant->get_friendly_name()); ngraph::copy_runtime_info(constant, scalar); ngraph::replace_node(constant, scalar); - return true; }; register_matcher(std::make_shared(constants), callback); diff --git a/src/common/snippets/src/pass/insert_loops.cpp b/src/common/snippets/src/pass/insert_loops.cpp new file mode 100644 index 00000000000000..30c9a20883b8d5 --- /dev/null +++ b/src/common/snippets/src/pass/insert_loops.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2022 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include "snippets/pass/insert_loops.hpp" +#include "snippets/op/loop_helpers.hpp" + +#include + +ngraph::snippets::pass::InsertLoops::InsertLoops(ov::PartialShape master_shape, size_t vector_size) +: master_shape(std::move(master_shape)), vector_size(vector_size) { +} + +bool ngraph::snippets::pass::InsertLoops::run_on_model(const std::shared_ptr &model) { + RUN_ON_FUNCTION_SCOPE(InsertLoops); + if (master_shape.is_dynamic()) + throw ngraph_error("InsertLoops doesn't support dynamic shapes yet"); + + const auto inner_dim = master_shape.size() - 1; + // Note: outer_dim could overflow if master_shape.size() < 2 + const auto outer_dim = master_shape.size() - 2; + const auto inner_work_amount = master_shape[inner_dim].get_length(); + const auto outer_work_amount = master_shape.size() >= 2 ? 
master_shape[outer_dim].get_length() : 1; + + ParameterVector commonParams = model->get_parameters(); + // Note that topological sort parses node arguments in reversed order, but results are added - in direct order + // So we need to pass the reversed results to LoopEnd to keep the original traversal order in topological sorter + const auto& orig_results = model->get_results(); + ResultVector commonResults(orig_results.rbegin(), orig_results.rend()); + std::vector ioShapes; + ioShapes.reserve(commonParams.size() + commonResults.size()); + std::transform(commonParams.begin(), commonParams.end(), std::back_inserter(ioShapes), + [](const std::shared_ptr& n) { return n->get_output_partial_shape(0); }); + std::transform(commonResults.begin(), commonResults.end(), std::back_inserter(ioShapes), + [](const std::shared_ptr& n) { return n->get_input_partial_shape(0); }); + + if (inner_work_amount > 0) { + std::vector apply_increments; + apply_increments.reserve(ioShapes.size()); + // Inner Loop applies increments if a dimension is not broadcasted + std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments), + [=](const PartialShape& ps) { + return ps[inner_dim] != 1 && master_shape[inner_dim] != 1; + }); + std::vector inner_finalization_offsets(ioShapes.size(), 0); + if (outer_work_amount > 1) { + // We need to step back if an outer dim is broadcasted, while the corresponding lower one is not + std::transform(ioShapes.begin(), ioShapes.end(), inner_finalization_offsets.begin(), + [=](const PartialShape& ps) { + return ps[outer_dim] == 1 && ps[inner_dim] != 1 ? 
-inner_work_amount : 0; + }); + } + const auto& inner_loop_begin = op::insertLoopBegin(commonParams); + const auto& inner_loop_end = insertLoopEnd(commonResults, inner_loop_begin, inner_dim, inner_work_amount, + vector_size, apply_increments, inner_finalization_offsets); + // set internal flag to enable scalar vs vector loop optimizations + inner_loop_end->has_outer_loop = outer_work_amount > 1; + // Due to features of topological sort, some Constants (Scalars) may appear right after Parameters in + // sorted ops (so it's between Parameters and LoopBegin). Consequently, ScalarEmitters would be called + // outside the Loop, and only the first Loop iteration would yield correct data (assuming the vector reg + // assigned to scalar will get corrupted inside the loop body). To avoid such cases, we add control dependency + // on LoopBegin to guarantee that the constants are executed inside the Loop. + for (const auto& n : model->get_ordered_ops()) { + if (auto c = std::dynamic_pointer_cast(n)) + c->add_control_dependency(inner_loop_begin); + else if (n == inner_loop_begin) + break; + } + } + + if (outer_work_amount > 1) { + std::vector apply_increments; + apply_increments.reserve(ioShapes.size()); + // Outer Loop applies increments only if a corresponding lower dim was broadcasted (or all lower dims == 1) + std::transform(ioShapes.begin(), ioShapes.end(), std::back_inserter(apply_increments), + [=](const PartialShape& ps) { + return ps[outer_dim] != 1 && ps[inner_dim] == 1; + }); + const auto& outer_loop_begin = op::insertLoopBegin(commonParams); + insertLoopEnd(commonResults, outer_loop_begin, outer_dim, outer_work_amount, 1, apply_increments); + } + + return true; +} \ No newline at end of file diff --git a/src/common/snippets/src/pass/insert_movebroadcast.cpp b/src/common/snippets/src/pass/insert_movebroadcast.cpp index 23740a8aa03711..499be69e67f062 100644 --- a/src/common/snippets/src/pass/insert_movebroadcast.cpp +++ 
b/src/common/snippets/src/pass/insert_movebroadcast.cpp @@ -18,16 +18,16 @@ using namespace ngraph; namespace { std::shared_ptr broadcast_node_last_dim(const ngraph::Output& value, - const ov::Shape& target_shape, const ov::Shape& normalized_shape) { + const ov::PartialShape& target_shape, const ov::PartialShape& normalized_shape) { std::shared_ptr broadcasted_node = value.get_node_shared_ptr(); - if (target_shape == value.get_shape()) { + if (target_shape == value.get_partial_shape()) { return broadcasted_node; } // Insert BroadcastMove only if the last dimension needs to be broadcasted. Higher-level dims broadcasting - // will be handled by pointer arithmetics in TileScheduler + // will be handled by pointer arithmetics inside outer LoopEmitter if (*target_shape.rbegin() != *normalized_shape.rbegin()) { - ov::Shape broadcasted_shape = normalized_shape; + ov::PartialShape broadcasted_shape = normalized_shape; *broadcasted_shape.rbegin() = *target_shape.rbegin(); broadcasted_node = std::make_shared(broadcasted_node, broadcasted_shape); } @@ -36,20 +36,20 @@ std::shared_ptr broadcast_node_last_dim(const ngraph::Output> get_numpy_broadcast_shapes(const std::vector& input_shapes) { +std::pair> get_numpy_broadcast_partial_shapes(const std::vector& input_shapes) { ov::PartialShape target_shape = input_shapes.front(); for (auto i = 1; i < input_shapes.size(); i++) { if (!ov::PartialShape::broadcast_merge_into(target_shape, input_shapes[i], op::AutoBroadcastType::NUMPY)) throw ngraph::ngraph_error("InsertMoveBroadcast: Failed broadcast-merge input shapes"); } - std::vector normalized_shapes; + std::vector normalized_shapes; for (const auto& input : input_shapes) { - ov::Shape padded_shape{input}; + ov::PartialShape padded_shape{input}; padded_shape.insert(padded_shape.begin(), target_shape.size() - padded_shape.size(), 1); normalized_shapes.push_back(std::move(padded_shape)); } - return {target_shape.get_shape(), normalized_shapes}; + return {target_shape, 
normalized_shapes}; } } // namespace @@ -72,15 +72,20 @@ ngraph::snippets::pass::InsertMoveBroadcast::InsertMoveBroadcast() { } return false; }; - std::vector input_shapes; + std::vector input_shapes; std::vector ignore_as_scalar; for (const auto& val : values) { - input_shapes.emplace_back(val.get_shape()); + input_shapes.emplace_back(val.get_partial_shape()); ignore_as_scalar.push_back(is_scalar_constant(val)); + // Do not insert MoveBroadcast if any of the last dims is dynamic, + // since we don't know if we really need it. In these cases, broadcasting will be performed + // by outer Loop based on runtime shapes. + if (!ignore_as_scalar.back() && !input_shapes.back().rbegin()->is_static()) + return false; } // find the output tensor's shape, then broadcast all inputs so that they are compatible with respect to the last dim - auto bcast_shapes = get_numpy_broadcast_shapes(input_shapes); + auto bcast_shapes = get_numpy_broadcast_partial_shapes(input_shapes); ngraph::OutputVector broadcasted_inputs; for (size_t i = 0; i < values.size(); ++i) { diff --git a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp index f63f9c933af14b..f3765e471971a2 100644 --- a/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp +++ b/src/common/snippets/src/pass/load_movebroadcast_to_broadcastload.cpp @@ -34,8 +34,8 @@ ngraph::snippets::pass::LoadMoveBroadcastToBroadcastLoad::LoadMoveBroadcastToBro return false; } - auto inshape = root->input(0).get_shape(); - auto outshape = root->output(0).get_shape(); + auto inshape = root->input(0).get_partial_shape(); + auto outshape = root->output(0).get_partial_shape(); auto broadcastload = std::make_shared(param, outshape); ngraph::copy_runtime_info(root, broadcastload); diff --git a/src/common/snippets/tests/include/lowering_utils.hpp b/src/common/snippets/tests/include/lowering_utils.hpp index 5af4af2a32b099..c156534d180843 100644 --- 
a/src/common/snippets/tests/include/lowering_utils.hpp +++ b/src/common/snippets/tests/include/lowering_utils.hpp @@ -38,10 +38,15 @@ class DummyGenerator : public ngraph::snippets::Generator { }; class LoweringTests : public TransformationTestsF { +public: + void SetUp() override; + void TearDown() override; protected: static std::shared_ptr getSubgraph(const std::shared_ptr& f); - static std::shared_ptr getLoweredSubgraph(const std::shared_ptr& f); + static std::shared_ptr getLoweredSubgraph(const std::shared_ptr& f, + const ov::PartialShape& master_shape); static std::shared_ptr getTokenizedSubgraph(const std::shared_ptr& f); + ov::PartialShape master_shape{}; }; } // namespace snippets diff --git a/src/common/snippets/tests/src/lowering_utils.cpp b/src/common/snippets/tests/src/lowering_utils.cpp index 4aab86d5d7c07c..ec644e62e514e1 100644 --- a/src/common/snippets/tests/src/lowering_utils.cpp +++ b/src/common/snippets/tests/src/lowering_utils.cpp @@ -30,8 +30,31 @@ DummyTargetMachine::DummyTargetMachine() { jitters[ngraph::snippets::op::Scalar::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::BroadcastMove::get_type_info_static()] = dummy_functor; jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = dummy_functor; - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = dummy_functor; + jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = dummy_functor; +} + +void LoweringTests::SetUp() { + manager.register_pass(); +} + +void LoweringTests::TearDown() { + auto cloned_function = ngraph::clone_function(*function); + if (!function_ref) { + function_ref = cloned_function; + } + manager.run_passes(function); + ASSERT_NO_THROW(check_rt_info(function)); + + if (comparator.should_compare(FunctionsComparator::ACCURACY)) { + auto acc_comparator = 
FunctionsComparator::no_default(); + acc_comparator.enable(FunctionsComparator::CmpValues::ACCURACY); + auto res = acc_comparator.compare(function, cloned_function); + ASSERT_TRUE(res.valid) << res.message; + comparator.disable(FunctionsComparator::CmpValues::ACCURACY); + } + auto res = comparator.compare(function, function_ref); + ASSERT_TRUE(res.valid) << res.message; } std::shared_ptr LoweringTests::getSubgraph(const std::shared_ptr& f) { @@ -52,9 +75,11 @@ std::shared_ptr LoweringTests::getSubgraph(const return subgraph; } -std::shared_ptr LoweringTests::getLoweredSubgraph(const std::shared_ptr &f) { +std::shared_ptr LoweringTests::getLoweredSubgraph(const std::shared_ptr &f, + const ov::PartialShape& master_shape) { auto subgraph = getTokenizedSubgraph(f); subgraph->set_generator(std::make_shared()); + subgraph->set_master_shape(master_shape); subgraph->generate(); return subgraph; } diff --git a/src/common/snippets/tests/src/pass/canonicalization.cpp b/src/common/snippets/tests/src/pass/canonicalization.cpp index 15c33e6df96e10..7b687bad226443 100644 --- a/src/common/snippets/tests/src/pass/canonicalization.cpp +++ b/src/common/snippets/tests/src/pass/canonicalization.cpp @@ -23,12 +23,12 @@ std::string CanonicalizationTests::getTestCaseName(testing::TestParamInfoGetParam(); input_blocked_shapes = {std::get<1>(inputs[0]), std::get<1>(inputs[1])}; - snippets_function = std::make_shared(std::vector{std::get<0>(inputs[0]), std::get<0>(inputs[1])}); + snippets_function = std::make_shared(std::vector{std::get<0>(inputs[0]), std::get<0>(inputs[1])}); } TEST_P(CanonicalizationTests, Add) { @@ -50,8 +50,9 @@ TEST_P(CanonicalizationTests, Add) { function_ref = snippets_function->getReference(); auto subgraph = getTokenizedSubgraph(function); subgraph->set_generator(std::make_shared()); - Shape canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); - ASSERT_DIMS_EQ(canonical_output_shape, expected_output_shape); + auto 
canonical_output_shape = subgraph->canonicalize(output_blocked_shapes, input_blocked_shapes); + ASSERT_TRUE(canonical_output_shape.is_static()); + ASSERT_DIMS_EQ(canonical_output_shape.get_shape(), expected_output_shape); } namespace CanonicalizationTestsInstantiation { diff --git a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp index 3e578119b25d19..aa26ecfe4cdb74 100644 --- a/src/common/snippets/tests/src/pass/collapse_subgraph.cpp +++ b/src/common/snippets/tests/src/pass/collapse_subgraph.cpp @@ -20,56 +20,56 @@ void CollapseSubgraphTests::run() { } TEST_F(CollapseSubgraphTests, smoke_Snippets_Eltwise) { - const auto &f = EltwiseFunction(std::vector {{2, 3}, {1, 3}}); + const auto &f = EltwiseFunction(std::vector {{2, 3}, {1, 3}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_MatMulWithEltwise) { - const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); + const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_AvoidLoopEltwise) { - const auto &f = EltwiseLogLoopFunction(std::vector {{2, 5}, {2, 1}}); + const auto &f = EltwiseLogLoopFunction(std::vector {{2, 5}, {2, 1}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_OneConvert) { - const auto &f = ConvertFunction(std::vector{{2, 5}}); + const auto &f = ConvertFunction(std::vector{{2, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertInput) { - const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); + const auto &f = ConvertInputFunction(std::vector{{2, 5}, {1, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } 
TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertOutput) { - const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); + const auto &f = ConvertOutputFunction(std::vector{{2, 5}, {1, 5}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertStub) { - const auto &f = ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); + const auto &f = ConvertStubFunction(std::vector{{2, 5, 2}, {1, 5, 1}}); function = f.getOriginal(); function_ref = f.getReference(); run(); } TEST_F(CollapseSubgraphTests, smoke_Snippets_ConvertPartialInputsAndResults) { - const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, + const auto &f = ConvertPartialInputsAndResultsFunction(std::vector{{2, 5, 1}, {1, 5, 1}, {2, 1, 10}}, std::vector{ov::element::i8, ov::element::bf16, ov::element::f32}, std::vector{ov::element::f32, ov::element::i8}); function = f.getOriginal(); diff --git a/src/common/snippets/tests/src/pass/insert_load_store.cpp b/src/common/snippets/tests/src/pass/insert_load_store.cpp index 9913225763b729..c76cc1ae3e26b1 100644 --- a/src/common/snippets/tests/src/pass/insert_load_store.cpp +++ b/src/common/snippets/tests/src/pass/insert_load_store.cpp @@ -25,16 +25,20 @@ std::string InsertLoadStoreTests::getTestCaseName(testing::TestParamInfo inputShapes(3); std::vector broadcastShapes(3); std::tie(inputShapes[0], inputShapes[1], inputShapes[2], broadcastShapes[0], broadcastShapes[1], broadcastShapes[2]) = this->GetParam(); - snippets_function = std::make_shared(inputShapes, broadcastShapes); + snippets_function = std::make_shared( + std::vector {inputShapes[0], inputShapes[1], inputShapes[2]}, broadcastShapes); + master_shape = inputShapes[0]; } TEST_P(InsertLoadStoreTests, ThreeInputsEltwise) { - auto subgraph = getLoweredSubgraph(snippets_function->getOriginal()); + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + 
master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); function = subgraph->get_body(); function_ref = snippets_function->getLowered(); } diff --git a/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp b/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp index 9be1c569a81b26..f4f3250530865c 100644 --- a/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp +++ b/src/common/snippets/tests/src/pass/insert_movebroadcast.cpp @@ -24,15 +24,22 @@ std::string InsertMoveBroadcastTests::getTestCaseName(testing::TestParamInfo inputShapes(2); std::vector broadcastShapes(2); std::tie(inputShapes[0], inputShapes[1], broadcastShapes[0], broadcastShapes[1]) = this->GetParam(); - snippets_function = std::make_shared(inputShapes, broadcastShapes); + snippets_function = std::make_shared(std::vector {inputShapes[0], inputShapes[1]}, broadcastShapes); + if (inputShapes[0].size() != inputShapes[1].size()) + IE_THROW() << "Expected input shapes of the same size"; + master_shape = {}; + for (int i = 0; i < inputShapes[0].size(); i++) + master_shape.push_back(static_cast(std::max(inputShapes[0][i], inputShapes[1][i]))); } TEST_P(InsertMoveBroadcastTests, AddBroadcast) { - auto subgraph = getLoweredSubgraph(snippets_function->getOriginal()); + PartialShape scheduler_shape({master_shape[master_shape.size() - 2], + master_shape[master_shape.size() - 1]}); + auto subgraph = getLoweredSubgraph(snippets_function->getOriginal(), scheduler_shape); function = subgraph->get_body(); function_ref = snippets_function->getLowered(); } diff --git a/src/common/snippets/tests/src/registers.cpp b/src/common/snippets/tests/src/registers.cpp index 2eb5cddd84fb9f..4b53f0e8092f67 100644 --- a/src/common/snippets/tests/src/registers.cpp +++ b/src/common/snippets/tests/src/registers.cpp @@ -33,6 +33,8 @@ TEST(TransformationTests, AssignRegisters) { auto s00 = std::make_shared(y02); s00->set_friendly_name("y03"); 
s00->set_friendly_name("s00"); f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1}); + // Note that testing the result is not strictly necessary, since the Result doesn't emit any code + f->get_result()->set_friendly_name("r00"); pass::Manager m; m.register_pass(); @@ -52,18 +54,19 @@ TEST(TransformationTests, AssignRegisters) { {"y01", 1}, {"y02", 2}, {"s00", 2}, // gpr + {"r00", 2} // gpr }; auto total_ops = 0; for (auto& op : f->get_ordered_ops()) { - auto& rt = op->get_rt_info(); - - auto it_rinfo = rt.find("reginfo"); - if (it_rinfo != rt.end()) { - auto reginfo = it_rinfo->second.as>(); - auto reg = reginfo[0]; - ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); - total_ops++; + for (const auto& output : op->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) { + auto reg = it_rt->second.as(); + ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); + total_ops++; + } } } ASSERT_EQ(total_ops, ref_registers.size()); @@ -120,6 +123,7 @@ TEST(TransformationTests, AssignRegisters2) { s00->set_friendly_name("s00"); f = std::make_shared(NodeVector{s00}, ParameterVector{p0, p1, p2, p3, p4, p5, p6, p7}); + f->get_result()->set_friendly_name("res00"); pass::Manager m; m.register_pass(); @@ -140,17 +144,19 @@ TEST(TransformationTests, AssignRegisters2) { {"r18", 0}, {"r19", 2}, {"r20", 4}, {"r21", 1}, {"r22", 0}, {"r23", 6}, {"r24", 1}, {"s00", 8}, + {"res00", 8} }; auto total_ops = 0; for (auto& op : f->get_ordered_ops()) { - auto& rt = op->get_rt_info(); - auto it_rinfo = rt.find("reginfo"); - if (it_rinfo != rt.end()) { - auto reginfo = it_rinfo->second.as>(); - auto reg = reginfo[0]; - ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); - total_ops++; + for (const auto& output : op->outputs()) { + const auto& rt = output.get_tensor_ptr()->get_rt_info(); + auto it_rt = rt.find("reginfo"); + if (it_rt != rt.end()) { + auto reg = it_rt->second.as(); + 
ASSERT_TRUE(ref_registers[op->get_friendly_name()] == reg); + total_ops++; + } } } ASSERT_EQ(total_ops, ref_registers.size()); diff --git a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp index bba788545f2087..634a9e5b38c3ea 100644 --- a/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/cpu_generator.cpp @@ -123,8 +123,8 @@ ov::intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_ jitters[ngraph::op::v7::Gelu::get_type_info_static()] = CREATE_EMITTER(ov::intel_cpu::jit_gelu_v7_emitter); jitters[ngraph::snippets::op::Kernel::get_type_info_static()] = CREATE_EMITTER(KernelEmitter); - jitters[ngraph::snippets::op::Tile::get_type_info_static()] = CREATE_EMITTER(TileEmitter); - jitters[ngraph::snippets::op::TileScheduler::get_type_info_static()] = CREATE_EMITTER(TileSchedulerEmitter); + jitters[ngraph::snippets::op::LoopBegin::get_type_info_static()] = CREATE_EMITTER(LoopBeginEmitter); + jitters[ngraph::snippets::op::LoopEnd::get_type_info_static()] = CREATE_EMITTER(LoopEndEmitter); } size_t ov::intel_cpu::CPUTargetMachine::get_lanes() const { @@ -141,7 +141,9 @@ bool ov::intel_cpu::CPUTargetMachine::is_supported() const { } code ov::intel_cpu::CPUTargetMachine::get_snippet() const { - h->create_kernel(); + if (h->create_kernel() != status::success) { + IE_THROW() << "Failed to create jit_kernel in get_snippet()"; + } return h->jit_ker(); } diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp index 2130457847f3a2..3dc0a1e043d2a7 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.cpp @@ -7,8 +7,10 @@ #include #include "jit_snippets_emitters.hpp" +#include "snippets/op/subgraph.hpp" using namespace Xbyak; +using ngraph::snippets::op::Subgraph; namespace ov { namespace intel_cpu { 
@@ -23,57 +25,70 @@ jit_container_emitter::jit_container_emitter(dnnl::impl::cpu::x64::jit_generator in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -void jit_container_emitter::map_abstract_registers(const std::vector &vec_pool, const std::vector &gpr_pool, - std::set& vecs_used, std::set& gprs_used) { - if (body.empty()) - IE_THROW() << "Cannot map registers for jit_container_emitter when its body is empty"; - auto abstract_to_physical = [](const std::vector& abstract_regs, const std::vector& regs_pool) { +void jit_container_emitter::map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, + std::vector& allocated_emitters) const { + if (allocated_emitters.empty()) + IE_THROW() << "Cannot map registers when there is no allocated_emitters provided"; + auto map_regs = [](const std::vector& abstract_regs, mapping_info& mapping) { + auto& abstract_to_physical = mapping.first; + auto& regs_pool = mapping.second; std::vector physical_regs(abstract_regs.size()); - for (size_t i = 0; i < abstract_regs.size(); i++) - physical_regs[i] = regs_pool.at(abstract_regs[i]); + for (size_t i = 0; i < abstract_regs.size(); i++) { + const auto abstract = abstract_regs[i]; + auto& physical = physical_regs[i]; + if (abstract_to_physical.count(abstract) == 0) { + if (regs_pool.empty()) + IE_THROW() << "Cannot map registers for jit_container_emitter: not enough regs in the pool"; + physical = regs_pool.back(); + regs_pool.pop_back(); + abstract_to_physical[abstract] = physical; + } else { + physical = abstract_to_physical[abstract]; + } + } return physical_regs; }; - for (auto& code : body) { + + for (auto& code : allocated_emitters) { const auto& emitter = code.first; std::vector in_abstract_regs, out_abstract_regs; std::tie(in_abstract_regs, out_abstract_regs) = code.second; std::vector in_physical_regs, out_physical_regs; switch (std::dynamic_pointer_cast(emitter)->get_in_out_type()) { case gpr_to_gpr: - // Note that gpr_to_gpr is used for high-level utility 
operations like Kernel/TileScheduler/Tile. + // Note that gpr_to_gpr is used for high-level utility operations like Kernel/Loop. // Input registers are not mapped in this case, since they contain utility info - // (num_params, tile increment, etc.), but not reg indexes. - in_physical_regs = std::move(in_abstract_regs); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); - gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + // (num_params, loop increment, etc.), but not reg indexes. + // todo: Note that LoopBeginEmitter and LoopEndEmitter demonstrate new paradigm, + // where all utility emitters align with conventional Op emitters + if (std::dynamic_pointer_cast(emitter) || + std::dynamic_pointer_cast(emitter)) + in_physical_regs = std::move(map_regs(in_abstract_regs, gpr_map_pool)); + else + in_physical_regs = std::move(in_abstract_regs); + out_physical_regs = std::move(map_regs(out_abstract_regs, gpr_map_pool)); break; case gpr_to_vec: // Load Emitters - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, gpr_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); - gprs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, gpr_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, vec_map_pool)); break; case vec_to_gpr: // Store Emitters - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, gpr_pool)); - vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - gprs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, vec_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, gpr_map_pool)); break; case 
vec_to_vec: // Regular operations - in_physical_regs = std::move(abstract_to_physical(in_abstract_regs, vec_pool)); - out_physical_regs = std::move(abstract_to_physical(out_abstract_regs, vec_pool)); - vecs_used.insert(in_physical_regs.begin(), in_physical_regs.end()); - vecs_used.insert(out_physical_regs.begin(), out_physical_regs.end()); + in_physical_regs = std::move(map_regs(in_abstract_regs, vec_map_pool)); + out_physical_regs = std::move(map_regs(out_abstract_regs, vec_map_pool)); break; default: IE_THROW() << "Unhandled in_out type"; } code.second = std::make_pair(in_physical_regs, out_physical_regs); if (auto container = std::dynamic_pointer_cast(code.first)) - container->map_abstract_registers(vec_pool, gpr_pool, vecs_used, gprs_used); + container->map_abstract_registers(gpr_map_pool, vec_map_pool, allocated_emitters); } } @@ -84,15 +99,18 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: IE_THROW() << "KernelEmitter invoked with invalid op argument"; if (kernel->region.empty()) IE_THROW() << "KernelEmitter invoked with empty body"; + if (kernel->compile_params == nullptr) + IE_THROW() << "KernelEmitter invoked with op::Kernel that contains no compile_params"; body = kernel->region; - if (!kernel->compile_params) - IE_THROW() << "KernelEmitter invoked without compile_params"; jcp = *reinterpret_cast(kernel->compile_params); // Initialize pools of gp and vec registers gp_regs_pool.resize(16); vec_regs_pool.resize(16); - std::iota(gp_regs_pool.begin(), gp_regs_pool.end(), 0); - std::iota(vec_regs_pool.begin(), vec_regs_pool.end(), 0); + // It's easier to remove the last item during mapping, so fill descending to map ascending + for (size_t i = 0; i < 16; i++) + gp_regs_pool[i] = vec_regs_pool[i] = 15 - i; + // todo: it's more convenient to use std::set as a pool container (unique and always sorted), + // but pools are vectors to align with emit_code signature. Change signature? 
auto remove_regs_from_pool = [](std::vector& pool, const std::set& to_remove) { // It's important to keep the order of other elements pool.erase(std::remove_if(pool.begin(), pool.end(), @@ -101,14 +119,27 @@ KernelEmitter::KernelEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl: // Reserve stack base and pointer for push(...) and pop(...) operations // Reserve abi_param1 and abi_param2, since they'll be used to pass runtime call args to kernel remove_regs_from_pool(gp_regs_pool, {Xbyak::Operand::RSP, Xbyak::Operand::RBP, - static_cast(abi_param1.getIdx()), - static_cast(abi_param2.getIdx())}); - std::set vecs_used, gprs_used; - map_abstract_registers(vec_regs_pool, gp_regs_pool, vecs_used, gprs_used); - remove_regs_from_pool(gp_regs_pool, gprs_used); - remove_regs_from_pool(vec_regs_pool, vecs_used); - // Remember used gprs to pass it to the TileSchedulerEmitter, so it can init them with appropriate data ptrs - gp_regs_used = std::vector(gprs_used.begin(), gprs_used.end()); + reg_indexes_idx, reg_const_params_idx}); + + mapping_info gpr_map_pool({}, gp_regs_pool); + mapping_info vec_map_pool({}, vec_regs_pool); + std::vector data_io_emitters; + std::copy_if(body.begin(), body.end(), std::back_inserter(data_io_emitters), + [](const AllocatedEmitter& code){ + const auto& emitter = code.first; + const auto emitter_type = std::dynamic_pointer_cast(emitter)->get_in_out_type(); + return emitter_type == gpr_to_vec || emitter_type == vec_to_gpr; + }); + // Note that we can't use reg_indexes_idx or reg_const_params_idx to store data pointers because these two + // regs are used to calculate offsets for the data pointers + map_abstract_registers(gpr_map_pool, vec_map_pool, data_io_emitters); + for (const auto& abstract_to_physical : gpr_map_pool.first) + data_ptr_regs_idx.push_back(abstract_to_physical.second); + // However we can use reg_indexes_idx and reg_const_params_idx for other operations since we won't need them + // after offsets calculation + 
gpr_map_pool.second.push_back(reg_indexes_idx); + gpr_map_pool.second.push_back(reg_const_params_idx); + map_abstract_registers(gpr_map_pool, vec_map_pool, body); } void KernelEmitter::emit_code(const std::vector &in, @@ -126,263 +157,211 @@ void KernelEmitter::validate_arguments(const std::vector &in, if (in.size() != 2) IE_THROW() << "KernelEmitter got invalid number of inputs. Expected 2, got " << in.size(); if (!out.empty()) - IE_THROW() << "KKernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); + IE_THROW() << "KernelEmitter got invalid number of outputs. Expected 0, got " << out.size(); + const auto num_params = in[0] + in[1]; + // The number of used gpr may be >= num_params since LoopBegin+LoopEnd could also use gpr to store work_amount + if (data_ptr_regs_idx.size() != num_params) + IE_THROW() << "KernelEmitter arguments are inconsistent with the gpr_regs_used size: in[0] + in[1] = " + << num_params << " data_ptr_regs_idx.size() = " << data_ptr_regs_idx.size(); } void KernelEmitter::init_data_pointers(size_t num_inputs, size_t num_params, const Reg64& reg_indexes, const Reg64& reg_const_params, const std::vector& data_ptr_regs) const { - const int64_t harness_num_dims = jcp.output_dims.size() - 1; - auto init_ptrs_with_offsets = [&](Reg64 pointer, const int64_t *offsets, Reg64 reg_tmp) { - for (int j = 0; j < harness_num_dims; j++) { - if (jcp.output_dims[j] != 1 && offsets[j] != 0) { + // master_shape size must be valid in both static and dynamic cases + const int64_t offsetRank = jcp.master_shape.size() - 1; + std::function init_ptr_with_offset; + init_ptr_with_offset = [&](Reg64 pointer, size_t offset_start_index, Reg64 reg_tmp) { + const int64_t *offsets = jcp.data_offsets + offset_start_index; + for (int j = 0; j < offsetRank; j++) { + if (jcp.master_shape[j] != 1 && offsets[j] != 0) { h->mov(reg_tmp, offsets[j]); h->imul(reg_tmp, h->ptr[reg_indexes + j * sizeof(size_t)]); h->add(pointer, reg_tmp); } } }; - for (auto i = 0; i < 
num_params; i++) { + const auto spare_corruptable_gpr = std::find_if(gp_regs_pool.begin(), gp_regs_pool.end(), + [this](size_t reg) { + return reg != reg_indexes_idx && reg != reg_const_params_idx; + }); + const bool last_iter_explicitly = spare_corruptable_gpr == gp_regs_pool.end(); + Reg64 reg_tmp = last_iter_explicitly ? data_ptr_regs.back() : Reg64(static_cast(*spare_corruptable_gpr)); + size_t i = 0; + for (; i < num_params - last_iter_explicitly; i++) { if (i < num_inputs) h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(src_ptrs) + i * sizeof(void*)]); else h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); - // we can use the last data_ptr_reg as tmp_reg until the last iteration, and reg_const_params then - Reg64 reg_tmp = i < num_params-1 ? data_ptr_regs.back() : reg_const_params; - init_ptrs_with_offsets(data_ptr_regs[i], &jcp.data_offsets[i * harness_num_dims], reg_tmp); + init_ptr_with_offset(data_ptr_regs[i], i * offsetRank, reg_tmp); + } + // a rare case when num_params is maximal, so we have no spare gprs + // * Static case: we can use reg_const_params as the last reg_tmp for the last iteration (and corrupt it), since + // it won't be used anymore + // * Dynamic case: we will need reg_const_params to pass runtime args to LoopScheduler, so we have to + // push a reg on the stack, and restore it value afterwards + if (last_iter_explicitly) { + h->mov(data_ptr_regs[i], h->ptr[reg_const_params + GET_OFF(dst_ptrs) + (i - num_inputs) * sizeof(void*)]); + reg_tmp = reg_const_params; + // can corrupt reg_const_params, since we won't use it anymore + init_ptr_with_offset(data_ptr_regs[i], i * offsetRank, reg_tmp); } } void KernelEmitter::emit_impl(const std::vector& in, const std::vector& out, - const std::vector& allocated_vec_regs, - const std::vector& allocated_gp_regs, + const std::vector& vec_pool, + const std::vector& gpr_pool, const ov::intel_cpu::emitter_context *emit_context) const { 
h->preamble(); const size_t num_inputs = in[0]; const size_t num_outputs = in[1]; - Reg64 reg_indexes = Reg64(abi_param1.getIdx()); - Reg64 reg_const_params = Reg64(abi_param2.getIdx()); + Reg64 reg_indexes = Reg64(static_cast(reg_indexes_idx)); + Reg64 reg_const_params = Reg64(static_cast(reg_const_params_idx)); std::vector data_ptr_regs; - transform_idxs_to_regs(gp_regs_used, data_ptr_regs); + transform_idxs_to_regs(data_ptr_regs_idx, data_ptr_regs); init_data_pointers(num_inputs, num_inputs + num_outputs, reg_indexes, reg_const_params, data_ptr_regs); - // todo: emit_impl is a const method, so we can't just push_back unused regs to the gp_regs_pool. - // we need a more elegant approach to avoid a full copy here - auto local_gpr_pool = gp_regs_pool; - local_gpr_pool.push_back(static_cast(reg_indexes.getIdx())); - local_gpr_pool.push_back(static_cast(reg_const_params.getIdx())); for (const auto& c : body) { const auto& emitter = c.first; std::vector in_regs, out_regs; std::tie(in_regs, out_regs) = c.second; - if (auto tile_scheduler = std::dynamic_pointer_cast(emitter)) - out_regs = gp_regs_used; - emitter->emit_code(in_regs, out_regs, vec_regs_pool, local_gpr_pool); + emitter->emit_code(in_regs, out_regs, vec_regs_pool, gp_regs_pool); } h->postamble(); } -TileSchedulerEmitter::TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile_scheduler = ov::as_type_ptr(n); - if (!tile_scheduler) - IE_THROW() << "TileSchedulerEmitter invoked with invalid op argument"; - if (!tile_scheduler->compile_params) - IE_THROW() << "TileEmitter invoked without compile_params"; - body = {tile_scheduler->vector_region, tile_scheduler->scalar_region}; - jcp = *reinterpret_cast(tile_scheduler->compile_params); -} -void TileSchedulerEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { + 
+LoopBeginEmitter::LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + loop_begin = ov::as_type_ptr(n); + if (!loop_begin) + IE_THROW() << "LoopBeginEmitter invoked with invalid op argument"; + const auto& target_inputs = loop_begin->output(loop_begin->get_output_size() - 1).get_target_inputs(); + // todo: this check could be excessive, since we check for it in validate_and_infer_types() + if (target_inputs.size() != 1) + IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must have exactly one input attached"; + const auto loop_end = ov::as_type_ptr(target_inputs.begin()->get_node()->shared_from_this()); + if (!loop_end) + IE_THROW() << "LoopBeginEmitter invoked with invalid configuration: the last output must be LoopEnd"; + work_amount = loop_begin->get_work_amount(); + evaluate_once = loop_begin->get_evaluate_once(); + num_inputs = loop_begin->get_input_size(); + in_out_type_ = emitter_in_out_map::gpr_to_gpr; +} + +void LoopBeginEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { validate_arguments(in, out, pool, gpr); emit_impl(in, out, pool, gpr, nullptr); } -void TileSchedulerEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (in.size() != 3) - IE_THROW() << "TileSchedulerEmitter got invalid number of inputs. Expected 3, got " << in.size(); - if (out.size() != in[0] + in[1]) - IE_THROW() << "TileSchedulerEmitter got invalid number of outputs. 
Expected " << in[0] + in[1] << " , got " << out.size(); - if (body.size() != 2) - IE_THROW() << "TileSchedulerEmitter got invalid body size, expected 2 (vector & scalar TileEmitter), got " << body.size(); - if (!(std::dynamic_pointer_cast(body[0].first) && std::dynamic_pointer_cast(body[1].first))) - IE_THROW() << "TileSchedulerEmitter can contain only TileEmitters inside its body"; -} - -void TileSchedulerEmitter::emit_tiles(const Reg64& reg_inner_amount, const std::vector& data_ptr_regs, size_t vector_size, - const std::vector& vec_pool, const std::vector& gpr_pool) const { - // TileAllocatedEmitter is just an alias to perform dynamic_pointer_cast only once and reuse it below several times - using TileAllocatedEmitter = std::pair, const ngraph::snippets::RegInfo&>; - TileAllocatedEmitter vector_tile {std::dynamic_pointer_cast(body[0].first), body[0].second}; - TileAllocatedEmitter scalar_tile {std::dynamic_pointer_cast(body[1].first), body[1].second}; - const size_t inner_work_amount = jcp.scheduler_dims[1]; - auto process_tile = - [&](const bool evaluate_once, const TileAllocatedEmitter& tile) { - // If Tile is evaluated only once, then we can emit its body directly and skip work_amount decrements and checks - if (evaluate_once) { - tile.first->emit_body(vec_pool, gpr_pool); - } else { - std::vector in_regs, out_regs; - std::tie(in_regs, out_regs) = tile.second; - // pass work_amount reg to Tile - in_regs.push_back(static_cast(reg_inner_amount.getIdx())); - for (const auto& reg : data_ptr_regs) - out_regs.emplace_back(reg.getIdx()); - tile.first->emit_code(in_regs, out_regs, vec_pool, gpr_pool); - } - }; - // todo: these optimizations should be performed on using Tile graph representation in the future - bool vector_evaluate_once = false; - if (inner_work_amount >= vector_size) { - vector_evaluate_once = inner_work_amount < 2 * vector_size; - // Need to set proper work amount for inner tiles if evaluated multiple times - if (!vector_evaluate_once) - 
h->mov(reg_inner_amount, inner_work_amount); - process_tile(vector_evaluate_once, vector_tile); - } - if (inner_work_amount % vector_size >= 1) { - bool scalar_evaluate_once = inner_work_amount % vector_size < 2; - if (!scalar_evaluate_once) { - // vector_tile is not executed, work_amount is not set - if (inner_work_amount < vector_size) { - h->mov(reg_inner_amount, inner_work_amount); - // vector_tile is executed, but work_amount is neither set nor decremented appropriately. - } else if (vector_evaluate_once) { - vector_tile.first->emit_ptr_increments(data_ptr_regs); - h->mov(reg_inner_amount, inner_work_amount - vector_size); - } - // else: vector_tile is executed multiple times, so work_amount is already set - } else { - if (vector_evaluate_once) { - vector_tile.first->emit_ptr_increments(data_ptr_regs); - } - } - process_tile(scalar_evaluate_once, scalar_tile); - } + +void LoopBeginEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (in.size() != num_inputs) + IE_THROW() << "Invalid inputs size: expected " << num_inputs << " got " << in.size(); + if (out.size() != num_inputs + 1) + IE_THROW() << "Invalid outputs size: expected " << num_inputs + 1 << " got " << out.size(); } -void TileSchedulerEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - const size_t num_inputs = in[0]; - const size_t num_outputs = in[1]; - const size_t vector_size = in[2]; - const size_t num_params = num_inputs + num_outputs; - const auto& data_ptr_reg_idxs(out); - std::vector data_ptr_regs; - transform_idxs_to_regs(data_ptr_reg_idxs, data_ptr_regs); - // todo: emit_impl has const input args, so we can't just pop_back necessary regs from gpr_pool. - // we need a more elegant approach to avoid a full copy here. 
Similar problem is demonstrated in KernelEmitter - auto local_gpr_pool = gpr_pool; - Reg64 reg_outer_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); - Reg64 reg_inner_amount = Reg64(static_cast(local_gpr_pool.back())); - local_gpr_pool.pop_back(); +void LoopBeginEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + // todo: In dynamic case we will also need to set broadcasting info here + Reg64 reg_work_amount = Reg64(out.back()); Label for_body; - const size_t outer_work_amount = jcp.scheduler_dims[0]; - if (outer_work_amount == 1) { - // emit code directly without looping over external dim - emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - } else if (outer_work_amount > 1) { - // We need to create a Loop in this case - h->mov(reg_outer_amount, outer_work_amount); - h->L(for_body); - { - emit_tiles(reg_inner_amount, data_ptr_regs, vector_size, vec_pool, local_gpr_pool); - - // Todo: Load and Store emitters are currently implemented so they ALWAYS increment appropriate pointers - // after reading/writing. This might be a problem if we need to read the same data multiple times (broadcasting shapes). - // To overcome this limitation, we add appropriate negative offsets if necessary. 
- for (auto i = 0; i < num_params; i++) { - if (jcp.scheduler_offsets[i] != 0) { - h->add(data_ptr_regs[i], jcp.scheduler_offsets[i]); - } - } - // Note that outer dimensions are always incremented by 1 (outer tiles are always scalar) - h->sub(reg_outer_amount, 1); - h->cmp(reg_outer_amount, 1); - h->jge(for_body, CodeGenerator::T_NEAR); - } + // save previous register state (if there is an outer loop that uses this reg for example) + if (!evaluate_once) { + h->mov(reg_work_amount, work_amount); } + // Note: loop address is not calculated at this point, so need to call calcJmpAddress() which is protected + // or ready(), but they both set internal flags and that's not a desired way to use them. + // So the most obvious WA is just to use current address manually + loop_begin->begin_address = h->getCurr(); + loop_begin->input_regs = in; +} + +LoopEndEmitter::LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, + const std::shared_ptr& n) : jit_emitter(h, isa, n) { + loop_end = ov::as_type_ptr(n); + if (!loop_end) + IE_THROW() << "LoopEndEmitter invoked with invalid op argument"; + loop_begin = loop_end->get_loop_begin(); + // todo: this check could be excessive, since we check for it in validate_and_infer_types() + if (!loop_begin) + IE_THROW() << "LoopEndEmitter invoked with invalid configuration: the last arg must be LoopBegin"; + // Note that 1 edge connects LoopBegin and LoopEnd + num_inputs = loop_begin->get_input_size(); + num_outputs = loop_end->get_output_size(); + increment = loop_end->get_increment(); + work_amount = loop_end->get_work_amount(); + apply_increments = loop_end->get_apply_increment(); + finalization_offsets = loop_end->get_finalization_offsets(); + evaluate_once = loop_end->get_evaluate_once(); + for (int i = 0; i < num_inputs; i++) + io_data_size.push_back(loop_begin->get_input_element_type(i).size()); + for (int i = 0; i < num_outputs; i++) + 
io_data_size.push_back(loop_end->get_input_element_type(i).size()); + in_out_type_ = emitter_in_out_map::gpr_to_gpr; } -std::vector& TileEmitter::get_nested_code() { - return body; -} - -TileEmitter::TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n) : jit_container_emitter(h, isa, n) { - const auto tile = ov::as_type_ptr(n); - if (!tile) - IE_THROW() << "TileEmitter invoked with invalid op argument"; - body = tile->region; - if (body.empty()) - IE_THROW() << "TileEmitter is invoked with empty body"; - num_inputs = tile->num_inputs; - num_outputs = tile->num_outputs; - io_dims = tile->io_dims; - io_data_size = tile->io_data_size; - increment = tile->increment; - if (io_dims.size() != num_inputs + num_outputs) - IE_THROW() << "TileEmitter constructor got inconsistent arguments. Check num_inputs + num_outputs == io_dims.size()"; -} - -void TileEmitter::emit_code(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { +void LoopEndEmitter::emit_code(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { validate_arguments(in, out, pool, gpr); emit_impl(in, out, pool, gpr, nullptr); } -void TileEmitter::validate_arguments(const std::vector &in, - const std::vector &out, - const std::vector &pool, - const std::vector &gpr) const { - if (in.size() != 1) - IE_THROW() << "TileEmitter got invalid number of inputs. Expected 1, got " << in.size(); - if (out.size() != io_dims.size()) - IE_THROW() << "TileEmitter got invalid number of outputs. 
Expected " << io_dims.size() << " , got " << out.size(); -} -void TileEmitter::emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const { - for (auto& code : body) - code.first->emit_code(code.second.first, code.second.second, vec_pool, gpr_pool); -} - -void TileEmitter::emit_ptr_increments(const std::vector& data_ptr_regs) const { - for (size_t i = 0; i < num_inputs + num_outputs; i++) { - // those with dims == 1 will be broadcasted, hence don't require increment - if (io_dims[i] != 1) - h->add(data_ptr_regs[i], increment * io_data_size[i]); +void LoopEndEmitter::validate_arguments(const std::vector &in, + const std::vector &out, + const std::vector &pool, + const std::vector &gpr) const { + if (loop_begin->input_regs.size() != num_inputs) + IE_THROW() << "Invalid loop_begin->input_regs size: expected " << num_inputs << " got " << loop_begin->input_regs.size(); + if (out.size() != num_outputs) + IE_THROW() << "Invalid number of out arguments: expected " << num_outputs << " got " << out.size(); + if (in.size() != num_outputs + 1) + IE_THROW() << "Invalid number of in arguments: expected " << num_inputs + 1 << " got " << in.size(); + const auto io_size = num_inputs + num_outputs; + if (apply_increments.size() != io_size) + IE_THROW() << "Invalid apply_increments size: expected " << io_size << " got " << apply_increments.size(); + if (finalization_offsets.size() != io_size) + IE_THROW() << "Invalid finalization_offsets size: expected: " << io_size << " got " << finalization_offsets.size(); +} + +void LoopEndEmitter::emit_impl(const std::vector& in, + const std::vector& out, + const std::vector& pool, + const std::vector& gpr, + const ov::intel_cpu::emitter_context *emit_context) const { + std::vector data_ptr_reg_idxs(loop_begin->input_regs); + data_ptr_reg_idxs.reserve(num_inputs + num_outputs); + std::copy(out.begin(), out.end(), std::back_inserter(data_ptr_reg_idxs)); + std::vector data_ptr_regs; + transform_idxs_to_regs(data_ptr_reg_idxs, 
data_ptr_regs); + Reg64 reg_work_amount = Reg64(in.back()); + if (!evaluate_once) { + for (int idx = 0; idx < data_ptr_regs.size(); idx++) { + if (apply_increments[idx]) + h->add(data_ptr_regs[idx], increment * io_data_size[idx]); + } + h->sub(reg_work_amount, increment); + h->cmp(reg_work_amount, increment); + h->jge(loop_begin->begin_address); } -} -void TileEmitter::emit_impl(const std::vector& in, - const std::vector& out, - const std::vector& vec_pool, - const std::vector& gpr_pool, - const ov::intel_cpu::emitter_context *emit_context) const { - Reg64 work_amount = Reg64(static_cast(in[0])); - std::vector data_ptr_regs; - transform_idxs_to_regs(out, data_ptr_regs); - Label for_body; - // Note that: - // * Work amount must be set by TileScheduler that executes Tiles - // * TileScheduler executes Tile only if it has to perform >= 1 iterations - h->L(for_body); - emit_body(vec_pool, gpr_pool); - emit_ptr_increments(data_ptr_regs); - h->sub(work_amount, increment); - h->cmp(work_amount, increment); - h->jge(for_body, CodeGenerator::T_NEAR); + for (int idx = 0; idx < data_ptr_regs.size(); idx++) { + if (finalization_offsets[idx] != 0) + h->add(data_ptr_regs[idx], finalization_offsets[idx] * io_data_size[idx]); + } } BroadcastMoveEmitter::BroadcastMoveEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, diff --git a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp index adfd88dfeddff6..7fa1b8f1aa958d 100644 --- a/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/jit_snippets_emitters.hpp @@ -23,6 +23,7 @@ namespace intel_cpu { #define SNIPPETS_MAX_SNIPPETS_DIMS 12 #define SNIPPETS_MAX_HARNESS_DIMS 5 #define SNIPPETS_MAX_TILE_RANK 2 +#define SNIPPETS_DYNAMIC_MASTER_SHAPE_RANK 6 #define GET_OFF(field) offsetof(jit_snippets_call_args, field) struct jit_snippets_call_args { const void 
*src_ptrs[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; @@ -30,40 +31,40 @@ struct jit_snippets_call_args { }; struct jit_snippets_compile_args { - int64_t scheduler_dims[SNIPPETS_MAX_TILE_RANK] = {}; - int64_t scheduler_offsets[SNIPPETS_MAX_SNIPPETS_DIMS] = {}; + std::vector master_shape{}; int64_t data_offsets[SNIPPETS_MAX_SNIPPETS_DIMS * SNIPPETS_MAX_HARNESS_DIMS] = {}; - std::vector output_dims = {}; }; /// -/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (presently KernelEmitter, -/// TileSchedulerEmitter and TileEmitter). This is needed to provide common interface for register mapping +/// \brief jit_container_emitter designed to wrap Emitters that contain other Emitters (for example, KernelEmitter) +/// This is needed to provide common interface for register mapping /// (abstract to physical) and nested code access. /// class jit_container_emitter: public jit_emitter { public: jit_container_emitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); + // mapping info contains abstract_to_physical map + regs_pool + using mapping_info = std::pair, std::vector&>; protected: // maps gpr and vec abstract registers to physical ones. Physical reg indexes are taken from the provided pools // (the first 2 args). All the used gpr and vec registers are also stored in the provided sets (the second 2 args). - void map_abstract_registers(const std::vector&, const std::vector&, - std::set&, std::set&); + void map_abstract_registers(mapping_info& gpr_map_pool, mapping_info& vec_map_pool, + std::vector& allocated_emitters) const; std::vector body; }; /// /// \brief Kernel is the only entry point to Codogen Jit compilation. Kernel perform abstract-to-physical register -/// mapping and creates pools of available gpr and vec registers. Kernel is expected to contain (at least one) -/// TileSchedulerEmitter. 
In general the enclosed emitters should be organized in the following way: -/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ -/// TileSchedulerEmitter { /* executes required inner, avoids emitting code that won't be executed */ -/// TileEmitter { /* inner vector tile */ -/// ... /* All the necessary Load/Strore/elementwise emitters */ -/// } -/// TileEmitter { /* inner scalar tile for tail processing */ -/// ... /* All the necessary Load/Strore/elementwise emitters */ -/// } -/// } +/// mapping and creates pools of available gpr and vec registers. Kernel usually to contains (at least one) +/// LoopBeginEmitter and LoopEndEmitter pair. In general the enclosed emitters should be organized in the following way: +/// KernelEmitter { /* entry point, maps registers, creates pools of available registers */ +/// 1.S LoopBeginEmitter /* Scalar Loop over the outer dimension [START] */ +/// 2.S LoopBeginEmitter /* inner vector loop [START] */ +/// ... /* All the necessary Load/Strore/elementwise emitters */ +/// 2.E LoopEndEmitter /* inner vector loop [END] */ +/// 3.S LoopBeginEmitter /* inner scalar loop for tail processing [START]*/ +/// ... /* All the necessary Load/Strore/elementwise emitters */ +/// 3.E LoopEndEmitter /* inner scalar loop for tail processing [END]*/ +/// 1.E LoopEndEmitter /* Scalar Loop over the outer dimension [END] */ /// } /// Note that Kernel doesn't accept any input arguments. /// @@ -92,29 +93,22 @@ class KernelEmitter : public jit_container_emitter { jit_snippets_compile_args jcp; std::vector gp_regs_pool; - std::vector gp_regs_used; + // gpr's used to store data pointers, track them to apply offsets in Kernel + std::vector data_ptr_regs_idx; std::vector vec_regs_pool; + const size_t reg_indexes_idx = abi_param1.getIdx(); + const size_t reg_const_params_idx = abi_param2.getIdx(); }; -/// -/// \brief TileSchedulerEmitter contains Tiles to be executed (presently vector and scalar). 
It calculates data offsets -/// and work amounts, performs data pointer decrements if necessary. It also performs some Tile optimizations: scalar/vector -/// tiles are emitted only if necessary; Tile body could be emitted directly, if only one Tile evaluation is required. -/// -/// \param in[0] The number of the node inputs -/// \param in[1] The number of the node outputs -/// \param in[2] The number of elements that fits into vector register -/// -class TileSchedulerEmitter : public jit_container_emitter { +class LoopBeginEmitter : public jit_emitter { public: - TileSchedulerEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, - const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} + LoopBeginEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); void emit_code(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; + // todo: it is purely virtual in the base class, but do we need it? + size_t get_inputs_num() const override {return 0;} private: void validate_arguments(const std::vector &in, @@ -127,50 +121,48 @@ class TileSchedulerEmitter : public jit_container_emitter { const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; - void emit_tiles(const Reg64&, const std::vector&, size_t, const std::vector& , const std::vector&) const; - - jit_snippets_compile_args jcp; + std::shared_ptr loop_begin; + size_t num_inputs = 0; + bool evaluate_once = false; + size_t work_amount = 0; // need to store work_amount explicitly, since two loops can work on the same dim (e.g. vector + scalar) }; -/// -/// \brief Tile is designed to organize loop over the input and output data. It is essentially a for(...) loop: -/// it performs operations specified by enclosed emitters, advances iteration counters -/// and breaks when necessary. 
-/// -/// \param in[0] The number of input entities (or scheduler counts) processed during one iteration of the tile. -/// It is expected to be 1 for outer or scalar tiles and vlen for vector tiles. -class TileEmitter : public jit_container_emitter { +class LoopEndEmitter : public jit_emitter { public: - TileEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); - - size_t get_inputs_num() const override {return 0;} - std::vector& get_nested_code(); + LoopEndEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n); void emit_code(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; - - void emit_body(const std::vector& vec_pool, const std::vector& gpr_pool) const; - void emit_ptr_increments(const std::vector& data_ptr_regs) const; + // todo: it is purely virtual in the base class, but do we need it? + size_t get_inputs_num() const override {return 0;} private: void validate_arguments(const std::vector &in, const std::vector &out, const std::vector &pool, const std::vector &gpr) const override; + void emit_impl(const std::vector& in, const std::vector& out, const std::vector& pool, const std::vector& gpr, const ov::intel_cpu::emitter_context *emit_context) const override; + std::shared_ptr loop_begin; + std::shared_ptr loop_end; + size_t num_inputs = 0; size_t num_outputs = 0; - std::vector io_dims {}; std::vector io_data_size {}; size_t increment = 0; + size_t work_amount = 0; + bool evaluate_once = false; + std::vector apply_increments; + std::vector finalization_offsets; }; + class NopEmitter : public jit_emitter { public: NopEmitter(dnnl::impl::cpu::x64::jit_generator* h, dnnl::impl::cpu::x64::cpu_isa_t isa, const std::shared_ptr& n) @@ -205,7 +197,6 @@ class BroadcastMoveEmitter : public jit_emitter { void emit_isa(const std::vector &in, const std::vector &out) const; private: - bool 
use_broadcast; size_t byte_size = 0lu; }; @@ -239,7 +230,7 @@ class ScalarEmitter : public jit_emitter { /// it's illigal to load/store to the same address multiple times /// Typical application can be if Load and BroadcastLoad are performed from the same pointer. /// If Load goes before BroadcastLoad topologicaly the resilt will be incorrect -/// For scalar loads we can use different tiles. Tiling indeed can be arbitrary and post increment should be somehow coded into ISA. +/// For scalar loads we can use different loops. Tiling indeed can be arbitrary and post increment should be somehow coded into ISA. /// Blocked parameter to tell if input is actually blocked. Broadcast means broadcast by W in other cases no need to substitute load. class MemoryEmitter : public jit_emitter { public: @@ -354,6 +345,5 @@ class StoreConvertEmitter : public MemoryEmitter { size_t count; std::unique_ptr store_emitter = nullptr; }; - } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp index 5e4fda63190a5a..33d0e2e61fec60 100644 --- a/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp +++ b/src/plugins/intel_cpu/src/ngraph_transformations/snippets_mark_skipped.cpp @@ -81,7 +81,7 @@ bool canBePerformedAsScaleShift(const std::shared_ptr &node, const i fusingPort = i; dataShape = node->get_input_partial_shape(i); // only one non-const parent is allowed - if (dataShape.is_dynamic() || ++numNonConstInputs != 1) + if (++numNonConstInputs != 1) return false; } else { // every const parent must have exactly one child @@ -97,8 +97,7 @@ bool canBePerformedAsScaleShift(const std::shared_ptr &node, const i if (i == fusingPort) continue; const ov::PartialShape weightShape = node->get_input_partial_shape(i); - if (weightShape.is_dynamic() || - !isPerTensorOrPerChannelBroadcastable(dataShape.get_shape(), 
weightShape.get_shape(), channelAxis, true)) + if (!isPerTensorOrPerChannelBroadcastable(dataShape.get_max_shape(), weightShape.get_max_shape(), channelAxis, true)) return false; } return true; @@ -250,22 +249,20 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, con NodeFusingType &updatedChainType, int& fusingAxis) { int num_non_const_inputs = 0; bool can_be_converted_to_FC = false; - ov::Shape bias_shape; - ov::Shape matmul_shape; + ov::PartialShape bias_shape; + ov::PartialShape matmul_shape; for (const auto &parent_out : node->input_values()) { const auto parent = parent_out.get_node_shared_ptr(); if (ngraph::op::is_constant(parent)) { bias_shape = parent_out.get_shape(); num_non_const_inputs++; } else { - const auto pshape = parent_out.get_partial_shape(); - if (pshape.is_dynamic() || pshape.get_shape().empty()) + matmul_shape = parent_out.get_partial_shape(); + if (matmul_shape.size() == 0) return false; - matmul_shape = pshape.get_shape(); const auto& grandparents = parent->input_values(); // first check that weights are constant and both activations and weights have static shape if (grandparents.size() == 2 && - grandparents[0].get_partial_shape().is_static() && grandparents[1].get_partial_shape().is_static() && ov::is_type(grandparents[1].get_node_shared_ptr())) { auto rank_a = grandparents[0].get_partial_shape().rank().get_length(); @@ -280,8 +277,9 @@ bool isSuitableChildForFusingMatMul(const std::shared_ptr &node, con // Matmul / FC bias fusion if (ov::is_type(node) && - bias_shape.back() == matmul_shape.back() && - bias_shape.back() == shape_size(bias_shape)) { + bias_shape.rbegin()->get_length() == matmul_shape.rbegin()->get_length() && + bias_shape.is_static() && + bias_shape.rbegin()->get_length() == shape_size(bias_shape.get_shape())) { return true; } @@ -431,7 +429,7 @@ bool SnippetsMarkSkipped::run_on_model(const std::shared_ptr &m) { RUN_ON_MODEL_SCOPE(SnippetsMarkSkipped); int channelAxis = DEFAULT_AXIS; for (auto &node : 
m->get_ordered_ops()) { - if (ngraph::op::is_constant(node)) + if (ngraph::op::is_constant(node) || ov::is_type(node)) continue; if (ngraph::op::is_parameter(node)) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 503d989492f5e3..7ad5ebf1636d1f 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -22,6 +22,7 @@ #include #include "emitters/cpu_generator.hpp" +#include "utils/cpu_utils.hpp" #include "snippets_transformations/fuse_load_store_and_convert.hpp" #include "ngraph_transformations/convert_to_swish_cpu.hpp" @@ -67,6 +68,7 @@ void Snippet::copy_snippet() { ngraph::copy_runtime_info(original_snippet, snippet); snippet->set_friendly_name(original_snippet->get_friendly_name()); snippet->set_generator(std::make_shared(host_isa)); + isa_num_lanes = snippet->get_generator()->get_target_machine()->get_lanes(); } void Snippet::initSupportedPrimitiveDescriptors() { @@ -89,7 +91,13 @@ void Snippet::initSupportedPrimitiveDescriptors() { // Todo: Snippets currently don't support per-channel broadcasting of Blocked descriptors because // canonicalization can't distinguish between and cases. // See snippets::op::Subgraph::canonicalize for details. 
- const bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual; + bool isBlockedApplicable = dnnl::impl::utils::one_of(ndims, 4, 5) && dimRanksAreEqual; + + for (const auto& inShape : inputShapes) { + if (isDynamic && inShape.getRank() != 1) + isBlockedApplicable = isBlockedApplicable && inShape.getMinDims()[1] != Shape::UNDEFINED_DIM && inShape.getMinDims()[1] > 1; + } + enum LayoutType { Planar, ChannelsFirst, @@ -192,42 +200,6 @@ void Snippet::initSupportedPrimitiveDescriptors() { void Snippet::selectOptimalPrimitiveDescriptor() { selectPreferPrimitiveDescriptor(getPrimitivesPriority(), true); } - -void Snippet::createPrimitive() { - // schedule definition part - // it defines offsets, strides and sizes for snippet kernel scheduling - define_schedule(); - - // code generation part - // it might be worth to generate explicitly for scheduler work amount for now, - // but in future some interface should be defined in order to communicate schedule for a kernel - // or generate schedule for a kernel. - // Here kernel is generated for most warying dimension by default. 
- generate(); -} - -void Snippet::execute(dnnl::stream strm) { - if (schedule.ptr == nullptr || !canUseOptimizedImpl) { - IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; - } - jit_snippets_call_args call_args; - for (size_t i = 0; i < srcMemPtrs.size(); i++) - call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; - - for (size_t i = 0; i < dstMemPtrs.size(); i++) - call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; - - if (tensorRank == rank6D) { - schedule_6d(call_args); - } else { - schedule_nt(call_args); - } -} - -bool Snippet::created() const { - return getType() == Type::Subgraph; -} - InferenceEngine::Precision Snippet::getRuntimePrecision() const { std::vector inputPrecisions; for (size_t i = 0; i < getParentEdges().size(); i++) { @@ -240,264 +212,343 @@ InferenceEngine::Precision Snippet::getRuntimePrecision() const { return getMaxPrecision(inputPrecisions); } -bool Snippet::canBeInPlace() const { - if (getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { - return false; - } - - if (getChildEdges().size() != 1) { - return false; - } - - for (auto& parentEdge : getParentEdges()) { - auto parent = parentEdge.lock()->getParent(); - if (parent->getChildEdges().size() != 1) - return false; - - // WA to prevent memory corruption caused by inplace feature - if (parent->getType() == Type::Concatenation) { - for (auto& parentParentEdge : parent->getParentEdges()) { - auto parentParent = parentParentEdge.lock()->getParent(); - if (parentParent->getChildEdges().size() != 1) - return false; - } +void Snippet::calcJITParams(std::vector& offsets) const { + const size_t numInputs = normInputShapes.size(); + const size_t numParams = numInputs + normOutputShapes.size(); + + // Note that we don't need offset for the last dim, since it's handled directly by Tile emitter + const size_t offset_rank = masterShape.size() - 1; + 
offsets.resize(numParams * (offset_rank), 1); + auto offset_calculation = [offset_rank, this](int64_t *off, const std::vector& dims, const size_t data_size) { + size_t k = dims.back(); + for (int i = offset_rank - 1; i >= 0; i--) { + auto tmp = (dims[i] == masterShape[i] && masterShape[i] != 1) ? k : 0; + off[i] = tmp * data_size; + k *= dims[i]; } - } - return getInputShapeAtPort(0) == getOutputShapeAtPort(0); -} - -static void offset_calculation(std::vector& offset, const std::vector& dims_in, const std::vector& dims_out) { - size_t k = 1; - for (int i = offset.size() - 1; i >= 0; i--) { - offset[i] = (dims_in[i] == dims_out[i]) ? k : 0; - k *= dims_in[i]; - } -} - -static auto collapseLastDims(std::vector& dims, size_t dimsToCollapse) -> void { - if (dimsToCollapse >= dims.size() - 1) - IE_THROW() << "Got invalid number of dims to collapse. Expected < " << dims.size() - 1 << " got " << dimsToCollapse; - for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { - dims[dims.size() - 1] *= dims[i]; - } - - for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { - dims[i] = dims[i - dimsToCollapse]; - } - - for (int i = dimsToCollapse - 1; i >= 0; i--) { - dims[i] = 1; - } -} - -void Snippet::define_schedule() { - auto edgeToBlockedShape = [](const EdgePtr& edge) { - const auto blockedDesc = edge->getMemory().GetDescWithType(); - ngraph::Shape shape(blockedDesc->getBlockDims()); - ngraph::AxisVector blocking(blockedDesc->getOrder()); - ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); - return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; - }; - auto prependWithOnes = [this](const std::vector& dims) { - if (tensorRank <= dims.size()) - return dims; - VectorDims result(tensorRank, 1); - std::copy(dims.begin(), dims.end(), &result[tensorRank - dims.size()]); - return result; }; - ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes; - for (size_t 
i = 0; i < inputShapes.size(); i++) - input_blocked_shapes.push_back(edgeToBlockedShape(getParentEdgesAtPort(i)[0])); - - ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; - for (size_t i = 0; i < outputShapes.size(); i++) - output_blocked_shapes.push_back(edgeToBlockedShape(getChildEdgesAtPort(i)[0])); - - exec_domain = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); - - // initialize by maximum output dimension. Dimensions of outputs should be broadcastable - tensorRank = std::max(static_cast(rank6D), exec_domain.size()); - // Canonicalization broadcasts inputs and outputs to max input rank, which can be smaller than tensorRank - // prepend to enable 6D scheduler - exec_domain = prependWithOnes(exec_domain); - const auto &body = snippet->get_body(); - for (const auto& p : body->get_parameters()) { - dims_in.emplace_back(prependWithOnes(p->get_shape())); - } - - for (size_t i = 0; i < body->get_output_size(); i++) { - dims_out.push_back(prependWithOnes(body->get_output_shape(i))); + for (size_t i = 0; i < numParams; i++) { + offset_calculation(&offsets[i * offset_rank], + i < numInputs ? 
normInputShapes[i] : normOutputShapes[i - numInputs], + dataSize[i]); } - - const auto config = getSelectedPrimitiveDescriptor()->getConfig(); - auto initOffsets = [this, config]() { - // find max rank input among all outputs - const size_t inputNum = getParentEdges().size(); - offsets_in.resize(inputNum); - for (size_t i = 0; i < inputNum; i++) { - offsets_in[i].resize(tensorRank, 1); - offset_calculation(offsets_in[i], dims_in[i], exec_domain); - for (size_t j = 0; j < tensorRank; j++) { - offsets_in[i][j] *= config.inConfs[i].getMemDesc()->getPrecision().size(); +} +void Snippet::optimizeExecDomain(std::vector& inputShapes, std::vector& outputShapes, + VectorDims &domain, size_t& TileRank) const { + const size_t minimalConcurrency = parallel_get_max_threads(); + const size_t minimalJitWorkAmount = 256; + const size_t ds = domain.size(); + if ( ds <= 2 || // not enough dimensions to collapse + domain[ds-1] >= minimalJitWorkAmount || // There is enough work for 1D Tiles, no need to collapse + domain[ds-1] * domain[ds-2] >= fullWorkAmount / minimalConcurrency) // There won't be enough work for every thread (even one iter) if we collapse + return; + auto findDimsToCollapse = [&]() { + auto collapseLastDims = [](VectorDims& dims, size_t dimsToCollapse) { + if (dimsToCollapse >= dims.size() - 1) + IE_THROW() << "Got invalid number of dims to collapse. 
Expected < " << dims.size() - 1 << " got " << dimsToCollapse; + for (int i = dims.size() - 2; i > dims.size() - dimsToCollapse - 2; i--) { + dims[dims.size() - 1] *= dims[i]; } - } - start_offset_in.resize(inputNum); - srcMemPtrs.resize(inputNum); - for (size_t i = 0; i < inputNum; i++) { - const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); - srcMemPtrs[i] = memPtr; - start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * - config.inConfs[i].getMemDesc()->getPrecision().size(); - } - - const size_t outputNum = config.outConfs.size(); - offsets_out.resize(outputNum); - for (size_t i = 0; i < outputNum; i++) { - offsets_out[i].resize(tensorRank, 1); - offset_calculation(offsets_out[i], dims_out[i], exec_domain); - for (size_t j = 0; j < tensorRank; j++) { - offsets_out[i][j] *= config.outConfs[i].getMemDesc()->getPrecision().size(); + for (int i = dims.size() - 2; i >= dimsToCollapse; i--) { + dims[i] = dims[i - dimsToCollapse]; } - } - start_offset_out.resize(outputNum); - dstMemPtrs.resize(outputNum); - for (size_t i = 0; i < outputNum; i++) { - const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); - dstMemPtrs[i] = memPtr; - start_offset_out[i] = memPtr->GetDescWithType()->getOffsetPadding() * - config.outConfs[i].getMemDesc()->getPrecision().size(); - } - }; - - auto find_dims_to_collapse = [this, config]() -> int { + for (int i = dimsToCollapse - 1; i >= 0; i--) { + dims[i] = 1; + } + }; int collapsedDims = 0; - size_t minimalConcurrency = parallel_get_max_threads(); - size_t minimalJitWorkAmount = 256; - size_t currentJitWorkAmount = exec_domain.back(); + size_t currentJitWorkAmount = domain[domain.size() - 1]; while (currentJitWorkAmount < minimalJitWorkAmount && currentJitWorkAmount < fullWorkAmount) { - if (static_cast(exec_domain.size()) - collapsedDims - 2 < 0) + if (static_cast(domain.size()) - collapsedDims - 2 < 0) break; bool canCollapse = true; - for (size_t i = 0; i < dims_in.size(); i++) { - if ((dims_in[i][dims_in[i].size() - 2] 
!= 1 && dims_in[i][dims_in[i].size() - 1] == 1) || - (dims_in[i][dims_in[i].size() - 2] == 1 && dims_in[i][dims_in[i].size() - 1] != 1)) { + for (size_t i = 0; i < inputShapes.size(); i++) { + const size_t last = inputShapes[i].size() - 1; + if ((inputShapes[i][last - 1] != 1 && inputShapes[i][last] == 1) || + (inputShapes[i][last - 1] == 1 && inputShapes[i][last] != 1)) { canCollapse = false; break; } } - size_t nextJitWorkAmount = currentJitWorkAmount * exec_domain[exec_domain.size() - 2]; + size_t nextJitWorkAmount = currentJitWorkAmount * domain[domain.size() - 2]; if (fullWorkAmount / nextJitWorkAmount >= minimalConcurrency) { currentJitWorkAmount = nextJitWorkAmount; // if we cannot use dim collapsing we should use tile2D if (!canCollapse) { - if (tileRank < maxTileRank) { - tileRank++; + if (TileRank < maxTileRank) { + TileRank++; continue; } break; } - collapsedDims++; - for (auto &d : dims_in) + for (auto &d : inputShapes) collapseLastDims(d, 1); - - for (auto &d : dims_out) + for (auto &d : outputShapes) collapseLastDims(d, 1); - - collapseLastDims(exec_domain, 1); + collapseLastDims(domain, 1); } else { break; } } - return collapsedDims; + return domain; + }; + findDimsToCollapse(); +} +ov::PartialShape Snippet::canonicalizeBody() { + auto edgeToBlockedShape = [](const EdgePtr& edge) { + const auto blockedDesc = edge->getMemory().GetDescWithType(); + std::vector dims; + // if blockDim == Shape::UNDEFINED_DIM, then it's a dynamic dimension, and we need to recreate a proper dynamic Dim + for (const auto& d : blockedDesc->getBlockDims()) + dims.emplace_back(d == Shape::UNDEFINED_DIM ? 
-1 : d); + ngraph::PartialShape shape(dims); + ngraph::AxisVector blocking(blockedDesc->getOrder()); + ngraph::element::Type precision = InferenceEngine::details::convertPrecision(blockedDesc->getPrecision()); + return ngraph::snippets::op::Subgraph::BlockedShape{shape, blocking, precision}; }; + inputShapeIsBlocked.resize(inputShapes.size(), false); + masterShapeIsBlocked = false; + ngraph::snippets::op::Subgraph::BlockedShapeVector input_blocked_shapes; + for (size_t i = 0; i < inputShapes.size(); i++) { + auto blockedShape = edgeToBlockedShape(getParentEdgesAtPort(i)[0]); + inputShapeIsBlocked[i] = std::get<0>(blockedShape).size() != std::get<1>(blockedShape).size(); + masterShapeIsBlocked = masterShapeIsBlocked || inputShapeIsBlocked[i]; + input_blocked_shapes.push_back(blockedShape); + } - auto initSchedulingInfo = [this, config]() -> void { - // initialize scheduling information - sch_offsets_in.resize(offsets_in.size(), 0); - sch_offsets_out.resize(offsets_out.size(), 0); - sch_dims.resize(maxTileRank, 1); - sch_dims[maxTileRank-1] = exec_domain.back(); - schedulerWorkAmount = fullWorkAmount / exec_domain.back(); - if (tileRank > 1) { - sch_dims[maxTileRank - tileRank] = exec_domain[tensorRank - 2]; - schedulerWorkAmount /= exec_domain[tensorRank - 2]; - exec_domain[tensorRank - 2] = 1; - - // update offsets for tile 2D because loaders and stores have ptr shifts in some cases - const int64_t vector_size = snippet->get_generator()->get_target_machine()->get_lanes(); - for (size_t i = 0; i < offsets_in.size(); i++) { - const int64_t offset = offsets_in[i][tensorRank - 2]; - const int64_t data_size = config.inConfs[i].getMemDesc()->getPrecision().size(); - if (offset == data_size || offset == vector_size * data_size) { - sch_offsets_in[i] = offset; - } else if ((offset > data_size) || (offset == 0 && dims_in[i].back() != 1 && dims_in[i].back() != vector_size)) { - sch_offsets_in[i] = offset - exec_domain.back() * data_size; - - // If scalar tile executes one 
time, ptr doesn't move on 1 value - // so we should absolutelly decrease offset - if (exec_domain.back() % vector_size == 1) { - sch_offsets_in[i] += data_size; - } - } - } + outputShapeIsBlocked.resize(outputShapes.size(), false); + ngraph::snippets::op::Subgraph::BlockedShapeVector output_blocked_shapes; + for (size_t i = 0; i < outputShapes.size(); i++) { + auto blockedShape = edgeToBlockedShape(getChildEdgesAtPort(i)[0]); + outputShapeIsBlocked[i] = std::get<0>(blockedShape).size() != std::get<1>(blockedShape).size(); + output_blocked_shapes.push_back(blockedShape); + } - for (size_t i = 0; i < offsets_out.size(); i++) { - const int64_t offset = offsets_out[i][tensorRank - 2]; - const size_t data_size = config.outConfs[i].getMemDesc()->getPrecision().size(); - if (offset == data_size || offset == vector_size * data_size) { - sch_offsets_out[i] = offset; - } else if ((offset > data_size) || (offset == 0 && dims_out[i].back() != 1 && dims_out[i].back() != vector_size)) { - sch_offsets_out[i] = offset - exec_domain.back() * data_size; - - // If scalar tile executes one time, ptr doesn't move on 1 value - // so we should absolutelly decrease offset - if (exec_domain.back() % vector_size == 1) { - sch_offsets_out[i] += data_size; - } + const auto canonicalShape = snippet->canonicalize(output_blocked_shapes, input_blocked_shapes); + return canonicalShape; +} +void Snippet::createPrimitive() { + // determine canonicalize, determine master_shape and prepend up to 6D + // NB! normInputShapes are updated, so body reshape might be needed + const auto& canonicalShape = canonicalizeBody(); + // initialize by maximum output dimension. 
Dimensions of outputs should be broadcastable + tensorRank = std::max(static_cast(rank6D), canonicalShape.size()); + + const auto config = getSelectedPrimitiveDescriptor()->getConfig(); + auto initDataSizes = [this, config]() { + const size_t numInputs = inputShapes.size(); + const size_t numOutputs = outputShapes.size(); + dataSize.resize(numInputs + numOutputs); + for (size_t i = 0; i < numInputs; i++) + dataSize[i] = config.inConfs[i].getMemDesc()->getPrecision().size(); + for (size_t i = 0; i < numOutputs; i++) + dataSize[i + numInputs] = config.outConfs[i].getMemDesc()->getPrecision().size(); + }; + initDataSizes(); + + jit_snippets_compile_args jcp; + if (canonicalShape.is_dynamic()) + IE_THROW() << "Snippets: Canonicalization returned dynamic shape in static pipeline"; + masterShape = canonicalShape.get_shape(); + const auto &body = snippet->get_body(); + for (const auto& p : body->get_parameters()) + normInputShapes.emplace_back(p->get_output_shape(0)); + for (const auto& r : body->get_results()) + normOutputShapes.emplace_back(r->get_input_shape(0)); + + prepareParams(); + jcp.master_shape = masterShape; + std::copy(data_offsets.begin(), data_offsets.end(), jcp.data_offsets); + generate(&jcp); +} + +std::vector Snippet::shapeInfer() const { + // todo: it's very strange that we don't have broadcast_merge_into for cpu shapes + auto broadcast_merge = [](VectorDims& dst, const VectorDims& src){ + // Ranks are both static. + auto dst_rank = dst.size(); + auto src_rank = src.size(); + const auto new_rank = std::max(dst_rank, src_rank); + dst.insert(dst.begin(), new_rank - dst_rank, 1); + std::vector dims(new_rank); + bool success = true; + for (int64_t i = 0; i < new_rank; i++) { + auto dsti = i < (new_rank - dst_rank) ? 1 : dst[i - (new_rank - dst_rank)]; + auto srci = i < (new_rank - src_rank) ? 
1 : src[i - (new_rank - src_rank)]; + if (dsti != srci && srci != Shape::UNDEFINED_DIM) { + if (dsti == 1 || dsti == Shape::UNDEFINED_DIM) { + dsti = srci; + } else { + success = false; } } } + return success; }; + for (size_t i = 0; i < getParentEdges().size(); i++) { + VectorDims inDims {getParentEdgesAtPort(i)[0]->getMemory().GetShape().getDims()}; + if (masterShapeIsBlocked && !inputShapeIsBlocked[i]) + inDims.insert(inDims.end(), 1); + // todo: this is a simple master_shape inference for shape-agnostic operations, + // we'll need to account for body operations semantics in the future + if (i == 0) + masterShape = inDims; + else + broadcast_merge(masterShape, inDims); + normInputShapes[i] = std::move(inDims); + } + if (std::any_of(masterShape.begin(), masterShape.end(), [](const Dim& d){ return d == Shape::UNDEFINED_DIM;})) { + std::ostringstream errorMessage; + errorMessage << "Can't compute static master shape for Snippet node with name: " << getName(); + errorMessage << ". Input shapes = ( "; + for (size_t i = 0; i < getParentEdges().size(); i++) { + errorMessage << i << " port = " << getParentEdgesAtPort(i)[0]->getMemory().GetShape().toString() << ", "; + } + errorMessage << "). 
Master shape = ( " << masterShape << " )"; + IE_THROW() << errorMessage.str(); + } - fullWorkAmount = 1; - for (const auto &d : exec_domain) { - fullWorkAmount *= d; + if (normOutputShapes.size() == 1) { + normOutputShapes[0] = masterShape; + return {masterShape}; } + std::vector outputDims; + std::vector new_shapes; + for (const auto& s : normInputShapes) + new_shapes.emplace_back(s); + const auto& outputShapes = snippet->reshape_body(new_shapes); + for (size_t i = 0; i < outputShapes.size(); i++) + normOutputShapes[i] = outputShapes[i]; + return normOutputShapes; +} - batchDimIdx = tensorRank - exec_domain.size(); - // Note that exec_domain can be modified inside find_dims_to_collapse() and/or initSchedulingInfo() - find_dims_to_collapse(); +void Snippet::prepareParams() { + // here must be all the stuff that could only be done for static shapes, e.g. offset calculation + // Here it must be all the stuff that could be done once for both static and dynamic shapes + + masterShape = getNormalizedDimsBySize(masterShape, tensorRank); + for (auto& pshape : normInputShapes) + pshape = getNormalizedDimsBySize(pshape, tensorRank); + for (auto& pshape : normOutputShapes) + pshape = getNormalizedDimsBySize(pshape, tensorRank); + + tileRank = 1; + fullWorkAmount = std::accumulate(masterShape.begin(), masterShape.end(), 1, std::multiplies()); + // optimizeExecDomain will collapse shape dimensions and adjust tile Rank + optimizeExecDomain(normInputShapes, normOutputShapes, masterShape, tileRank); + exec_domain = masterShape; + + // todo: probably better to pass a call_args instance + calcJITParams(data_offsets); + auto initStartMemoryOffsets = [this]() { + const auto config = getSelectedPrimitiveDescriptor()->getConfig(); + const size_t numInputs = inputShapes.size(); + start_offset_in.resize(numInputs); + srcMemPtrs.resize(numInputs); + for (size_t i = 0; i < numInputs; i++) { + const auto memPtr = getParentEdgeAt(i)->getMemoryPtr(); + srcMemPtrs[i] = memPtr; + 
start_offset_in[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize[i]; + } + const size_t numOutputs = outputShapes.size(); + start_offset_out.resize(numOutputs); + dstMemPtrs.resize(numOutputs); + for (size_t i = 0; i < numOutputs; i++) { + const auto memPtr = getChildEdgeAt(i)->getMemoryPtr(); + dstMemPtrs[i] = memPtr; + start_offset_out[i] = memPtr->GetDescWithType()->getOffsetPadding() * dataSize[i + numInputs]; + } + }; + // initialize start offsets to src and dst memory + // Needs to be done for every set of input shapes sce memory ptrs could've updated + initStartMemoryOffsets(); + std::vector scheduler_work_amounts; + // rename schedulerWorkAmount to harnessWorkAmount? + harnessWorkAmount = fullWorkAmount; + const auto rank = exec_domain.size(); + for (auto i = rank - tileRank; i < rank; i++) { + auto& dim = exec_domain[i]; + harnessWorkAmount /= dim; + scheduler_work_amounts.push_back(dim); + dim = 1; + } - initOffsets(); - initSchedulingInfo(); + std::vector new_shapes; + for (const auto& s : normInputShapes) { + ov::Shape ns(tileRank, 0); + const int offset = s.size() - tileRank; + // todo: this check is excessive, remove it before merge + if (offset < 0) + IE_THROW() << "Error during creating reduced body shapes: tileRank is larger than the input size"; + std::copy(s.begin() + offset, s.end(), ns.begin()); + new_shapes.emplace_back(std::move(ns)); + } + snippet->set_master_shape(PartialShape(scheduler_work_amounts)); + snippet->reshape_body(new_shapes); } -void Snippet::generate() { - jit_snippets_compile_args jcp; - jcp.output_dims = exec_domain; - std::copy(sch_dims.begin(), sch_dims.end(), jcp.scheduler_dims); - std::copy(sch_offsets_in.begin(), sch_offsets_in.end(), jcp.scheduler_offsets); - std::copy(sch_offsets_out.begin(), sch_offsets_out.end(), &jcp.scheduler_offsets[sch_offsets_in.size()]); - size_t harness_num_dims = jcp.output_dims.size() - 1; - if (harness_num_dims > SNIPPETS_MAX_HARNESS_DIMS) { - canUseOptimizedImpl = false; - 
harness_num_dims = SNIPPETS_MAX_HARNESS_DIMS; +bool Snippet::needPrepareParams() const { + return inputShapesModified() || !schedule.ptr; +} + +void Snippet::updateSrcDstPtrs(jit_snippets_call_args& call_args) const { + for (size_t i = 0; i < srcMemPtrs.size(); i++) + call_args.src_ptrs[i] = reinterpret_cast(srcMemPtrs[i]->GetData()) + start_offset_in[i]; + + for (size_t i = 0; i < dstMemPtrs.size(); i++) + call_args.dst_ptrs[i] = reinterpret_cast(dstMemPtrs[i]->GetData()) + start_offset_out[i]; +} + +void Snippet::execute(dnnl::stream strm) { + if (schedule.ptr == nullptr) { + IE_THROW() << "Snippet can't use Optimized implementation and can't fallback to reference"; } - for (size_t i = 0; i < inputShapes.size(); i++) { - auto b = offsets_in[i].begin(); - std::copy(b, b + harness_num_dims, &jcp.data_offsets[i * harness_num_dims]); + jit_snippets_call_args call_args; + updateSrcDstPtrs(call_args); + + if (tensorRank == rank6D) { + schedule_6d(call_args); + } else { + schedule_nt(call_args); } - for (size_t i = 0; i < outputShapes.size(); i++) { - auto b = offsets_out[i].begin(); - std::copy(b, b + harness_num_dims, &jcp.data_offsets[(inputShapes.size() + i) * harness_num_dims]); +} + +bool Snippet::canBeInPlace() const { + if (isDynamic || getParentEdgesAtPort(0)[0]->getParent()->getType() == Type::Input) { + return false; } + if (getChildEdges().size() != 1) { + return false; + } + + for (auto& parentEdge : getParentEdges()) { + auto parent = parentEdge.lock()->getParent(); + if (parent->getChildEdges().size() != 1) + return false; + // WA to prevent memory corruption caused by inplace feature + if (parent->getType() == Type::Concatenation) { + for (auto& parentParentEdge : parent->getParentEdges()) { + auto parentParent = parentParentEdge.lock()->getParent(); + if (parentParent->getChildEdges().size() != 1) + return false; + } + } + } + return getInputShapeAtPort(0) == getOutputShapeAtPort(0); +} + +bool Snippet::created() const { + return getType() == 
Type::Subgraph; +} + +void Snippet::generate(const jit_snippets_compile_args* jcp) { ov::pass::Manager optManager; optManager.register_pass(); optManager.register_pass(); @@ -518,8 +569,7 @@ void Snippet::generate() { return convert->get_input_element_type(0) != ov::element::f32; return true; }); - - schedule = snippet->generate(optManager, reinterpret_cast(&jcp)); + schedule = snippet->generate(optManager, reinterpret_cast(jcp)); } void Snippet::schedule_6d(const jit_snippets_call_args& call_args) const { @@ -536,7 +586,7 @@ void Snippet::schedule_nt(const jit_snippets_call_args& call_args) const { const auto& work_size = exec_domain; parallel_nt(0, [&](const int ithr, const int nthr) { size_t start = 0, end = 0; - splitter(schedulerWorkAmount, nthr, ithr, start, end); + splitter(harnessWorkAmount, nthr, ithr, start, end); std::vector indexes(work_size.size() - 1, 0); for (size_t iwork = start; iwork < end; ++iwork) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 9c302555fb6823..0dc2354f02cd53 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -38,6 +38,9 @@ class Snippet : public Node { // Here we convert to canonical for & jit everything void createPrimitive() override; + void prepareParams() override; + std::vector shapeInfer() const override; + bool needPrepareParams() const override; bool canBeInPlace() const override; bool created() const override; @@ -55,10 +58,12 @@ class Snippet : public Node { // NOTE: Before call mutex should be initialized void copy_snippet(); - void define_schedule(); - - void generate(); + ov::PartialShape canonicalizeBody(); + void optimizeExecDomain(std::vector&, std::vector&, VectorDims&, size_t&) const; + void calcJITParams(std::vector& offsets) const; + void generate(const jit_snippets_compile_args*); + void updateSrcDstPtrs(jit_snippets_call_args&) const; // Evaluates generated snippet using parallel backend 
void schedule_6d(const jit_snippets_call_args& const_args) const; void schedule_nt(const jit_snippets_call_args& const_args) const; @@ -73,34 +78,36 @@ class Snippet : public Node { // Holds ISA version used is codeGeneration target dnnl::impl::cpu::x64::cpu_isa_t host_isa; + size_t isa_num_lanes; // number of elements that fit in vector size // Holds index of output used as in execution domain // it should be compatible with a schedule's work size std::vector exec_domain = {}; /// scheduling info - size_t batchDimIdx = 0; size_t tensorRank = 0; size_t tileRank = 1; size_t fullWorkAmount = 0; - size_t schedulerWorkAmount = 0; + size_t harnessWorkAmount = 0; const size_t maxTileRank = 2; std::vector srcMemPtrs = {}; std::vector dstMemPtrs = {}; + std::vector dataSize = {}; - std::vector> dims_in = {}; - std::vector> offsets_in = {}; - std::vector start_offset_in = {}; - std::vector start_offset_out = {}; + std::vector data_offsets; + // this is needed for fast shape inference of blocking-invariant prepended shapes + std::vector inputShapeIsBlocked = {}; // we need this info to shape-infer mixed layouts + std::vector outputShapeIsBlocked = {}; // we need this info to shape-infer mixed layouts + bool masterShapeIsBlocked = false; - std::vector> dims_out = {}; - std::vector> offsets_out = {}; + // master shape is mutable since we need to modify it inside const shapeInfer method + mutable VectorDims masterShape = {}; + mutable std::vector normInputShapes = {}; + mutable std::vector normOutputShapes = {}; - std::vector sch_dims = {}; - std::vector sch_offsets_in = {}; - std::vector sch_offsets_out = {}; - bool canUseOptimizedImpl = true; + std::vector start_offset_in = {}; + std::vector start_offset_out = {}; }; } // namespace node diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index d9252bf2ecbe21..806dede4417a89 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -650,7 +650,6 @@ 
static void TransformationUpToCPUSpecificOpSet(std::shared_ptr if (n->inputs().size() > 1 && !ov::is_type(n->get_input_node_shared_ptr(1))) return true; } - const auto& inputs = n->inputs(); // todo: clarify whether we can evaluate snippets on const paths const bool has_only_const_inputs = std::all_of(inputs.begin(), inputs.end(), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp index ebc3685c80a3c3..9c5c2a9904a48d 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/add.cpp @@ -22,15 +22,40 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, Add, ::testing::Values(CommonTestUtils::DEVICE_CPU)), Add::getTestCaseName); + +namespace snippets_static_1 { +// These inputs are needed to test static Loop optimizations (emit the whole tile, body with increments, set WA etc) +std::vector inShapesStatic1{{1, 16, 29, 1}, {1, 16, 29, 7}, {1, 16, 29, 8}, {1, 16, 29, 15}, {1, 16, 29, 16}, {1, 16, 29, 31}}; +std::vector inShapesStatic2{{1, 16, 29, 1}, {1, 16, 1, 1}, {1, 1, 1, 1}}; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinh, - ::testing::Combine( - ::testing::Values(ov::Shape {1, 42, 16, 64}), - ::testing::Values(ov::Shape {1, 42, 16, 1}), - ::testing::Values(ov::element::f32), - ::testing::Values(3), // Add + 2 sinh after inputs - ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts - ::testing::Values(CommonTestUtils::DEVICE_CPU)), + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic1), + ::testing::ValuesIn(inShapesStatic2), + ::testing::Values(ov::element::f32), + ::testing::Values(3), // Add + 2 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), AddSinh::getTestCaseName); +// 
test cross-tile (vector vs scalar) optimizations in the absence of vector tile +std::vector> inShapesStatic{ + {{1, 128, 1, 1}, {1, 128, 1, 1}}, + {{1, 128, 1, 9}, {1, 128, 1, 9}}, + {{1, 128, 1, 17}, {1, 128, 1, 17}}, + {{1, 128, 1, 29}, {1, 128, 1, 29}}, + {{1, 128, 1, 33}, {1, 128, 1, 33}}, + {{1, 128, 9, 30}, {1, 128, 1, 30}}, + {{1, 128, 9, 1}, {1, 128, 1, 30}}, +}; +INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhPair, + ::testing::Combine( + ::testing::ValuesIn(inShapesStatic), + ::testing::Values(ov::element::f32), + ::testing::Values(3), // Add + 2 converts after inputs + ::testing::Values(1), // Subgraph is created, since the inputs are followed by converts + ::testing::Values(CommonTestUtils::DEVICE_CPU)), + AddSinhPair::getTestCaseName); + +} // namespace snippets_static_1 INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Eltwise, AddSinhConst, ::testing::Combine( diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp index 323e069ebc0a5b..4b32e9ded8657b 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/convert.cpp @@ -30,10 +30,10 @@ const std::vector, std::vector> inputShapes_Convert = { - { ov::Shape{2, 16} }, - { ov::Shape{5, 5} }, - { ov::Shape{2, 12, 1} } +const std::vector> inputShapes_Convert = { + { ov::PartialShape{2, 16} }, + { ov::PartialShape{5, 5} }, + { ov::PartialShape{2, 12, 1} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_Convert, Convert, @@ -57,10 +57,10 @@ const std::vector, std::vector> inputShapes_ConvertInput = { - { ov::Shape{2, 16}, ov::Shape{1, 16} }, - { ov::Shape{5, 18}, ov::Shape{5, 1} }, - { ov::Shape{3, 1}, ov::Shape{3, 21} } +const std::vector> inputShapes_ConvertInput = { + { ov::PartialShape{2, 16}, ov::PartialShape{1, 16} }, + { ov::PartialShape{5, 18}, ov::PartialShape{5, 1} 
}, + { ov::PartialShape{3, 1}, ov::PartialShape{3, 21} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertInput, ConvertInput, @@ -94,10 +94,10 @@ const std::vector, std::vector> inputShapes_ConvertPartialInputsAndResults = { - { ov::Shape{2, 16}, ov::Shape{1, 16}, ov::Shape{1, 1} }, - { ov::Shape{5, 18}, ov::Shape{5, 1}, ov::Shape{1, 18} }, - { ov::Shape{3, 1}, ov::Shape{3, 21}, ov::Shape{3, 1} } +const std::vector> inputShapes_ConvertPartialInputsAndResults = { + { ov::PartialShape{2, 16}, ov::PartialShape{1, 16}, ov::PartialShape{1, 1} }, + { ov::PartialShape{5, 18}, ov::PartialShape{5, 1}, ov::PartialShape{1, 18} }, + { ov::PartialShape{3, 1}, ov::PartialShape{3, 21}, ov::PartialShape{3, 1} } }; INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertPartialInputsAndResults, ConvertPartialInputsAndResults, @@ -117,7 +117,7 @@ const std::vector, std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertMany), ::testing::Values(2), ::testing::Values(1), @@ -126,7 +126,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnInputs, ConvertManyOnInputs INSTANTIATE_TEST_SUITE_P(smoke_Snippets_ConvertManyOnOutputs, ConvertManyOnOutputs, ::testing::Combine( - ::testing::Values(std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertMany), ::testing::Values(2), // sinh + subgraph ::testing::Values(1), @@ -140,7 +140,7 @@ const std::vector, std::vector{ov::Shape{5, 5, 5, 5}}), + ::testing::Values(std::vector{{5, 5, 5, 5}}), ::testing::ValuesIn(types_ConvertManyIO), ::testing::Values(2), // sinh + subgraph ::testing::Values(1), diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp index fa182cf548a937..90392416e4591c 100644 --- 
a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/snippets/two_inputs_and_outputs.cpp @@ -10,7 +10,7 @@ namespace test { namespace snippets { namespace { -const std::vector> input_shapes = { +const std::vector> input_shapes = { { {5, 5, 256, 1}, {5, 5, 256, 1} }, { {5, 5, 16, 35}, {5, 5, 16, 35} }, { {5, 5, 256, 1}, {5, 5, 256, 35} }, @@ -26,7 +26,6 @@ const std::vector> input_shapes = { { {5, 5, 35, 17}, {5, 5, 35, 17} }, { {5, 5, 35, 17}, {5, 5, 1, 17} }, - { {5, 5, 35, 18}, {5, 5, 35, 18} }, { {5, 5, 35, 18}, {5, 5, 1, 18} }, }; diff --git a/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp b/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp index c02eb1a2a45de8..9aab3ffdfe7a01 100644 --- a/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp +++ b/src/plugins/intel_cpu/tests/unit/ngraph_transformations/snipptes_mark_skipped.cpp @@ -31,7 +31,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipAfterInputsEltwise) { } TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipAfterInputsMatMulEltwise) { - const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); + const auto &f = MatMulEltwiseBranchesFunction(std::vector {{1, 3, 4, 4}, {1, 3, 4, 4}}); function = f.getOriginal(); // Fully tokenizable, since inputs are followed by MatMul function_ref = f.getReference(); @@ -42,7 +42,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_Snippets_SkipConvFused_ConvMulActivation) std::vector> eltwiseOps {std::make_shared(), std::make_shared(), std::make_shared()}; - std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; + std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; const auto &f = ConvMulActivationFunction(inputShapes, eltwiseOps); function = f.getOriginal(); // Fully tokenizable, since Mul with 2 inputs isn't fused into 
Convolution @@ -54,7 +54,7 @@ TEST_F(SnippetsMarkSkippedTests, smoke_SkipConvFused_ConvSumActivation) { std::vector> eltwiseOps {std::make_shared(), std::make_shared(), std::make_shared()}; - std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; + std::vector inputShapes {{1, 2, 16, 16}, {1, 2, 1, 16}}; const auto &f = ConvMulActivationFunction(inputShapes, eltwiseOps); function = f.getOriginal(); // Not tokenizable, since Add + Eltwises can be fused into Convolution diff --git a/src/tests/functional/plugin/shared/include/snippets/add.hpp b/src/tests/functional/plugin/shared/include/snippets/add.hpp index 84338e53215f3a..3f19a02737980c 100644 --- a/src/tests/functional/plugin/shared/include/snippets/add.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/add.hpp @@ -19,6 +19,14 @@ typedef std::tuple< std::string // Target Device > AddParams; +typedef std::tuple< + std::vector, // Input 0, Input 1 Shape + ov::element::Type, // Element type + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device +> AddParamsPair; + typedef std::tuple< ov::Shape, // Input 0 Shape ov::element::Type, // Element type @@ -41,6 +49,15 @@ class AddSinh : public Add { void SetUp() override; }; +// repack AddSinh input shapes into shape vector to cover some cases easier +class AddSinhPair : public testing::WithParamInterface, + virtual public ov::test::SnippetsTestsCommon { +public: + static std::string getTestCaseName(testing::TestParamInfo obj); +protected: + void SetUp() override; +}; + class AddSinhConst : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: diff --git a/src/tests/functional/plugin/shared/include/snippets/convert.hpp b/src/tests/functional/plugin/shared/include/snippets/convert.hpp index bd4d7641711a0a..fe534480fc4268 100644 --- a/src/tests/functional/plugin/shared/include/snippets/convert.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/convert.hpp @@ -11,7 +11,7 @@ 
namespace test { namespace snippets { typedef std::tuple< - std::vector, // InputShapes + std::vector, // InputShapes std::pair, std::vector>, // Input and Output data types for Converts size_t, // Expected num nodes size_t, // Expected num subgraphs diff --git a/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp b/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp index bb39b7ded31678..a8b3c202b78a0c 100644 --- a/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/three_inputs_eltwise.hpp @@ -19,6 +19,15 @@ typedef std::tuple< std::string // Target Device > ThreeInputsEltwiseParams; +typedef std::tuple< + InputShape, // Input 0 Shape + InputShape, // Input 1 Shape + InputShape, // Input 2 Shape + size_t, // Expected num nodes + size_t, // Expected num subgraphs + std::string // Target Device + > ThreeInputsEltwiseDynamicParams; + class ThreeInputsEltwise : public testing::WithParamInterface, virtual public ov::test::SnippetsTestsCommon { public: @@ -33,7 +42,6 @@ class ThreeInputsEltwiseSinh : public ThreeInputsEltwise { void SetUp() override; }; - } // namespace snippets } // namespace test } // namespace ov diff --git a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp index 0a209de2fe9244..4284ceacfa4541 100644 --- a/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp +++ b/src/tests/functional/plugin/shared/include/snippets/two_inputs_and_outputs.hpp @@ -11,7 +11,7 @@ namespace test { namespace snippets { typedef std::tuple< - std::vector, // Input Shape All shapes + std::vector, // Input Shape All shapes size_t, // Expected num nodes size_t, // Expected num subgraphs std::string // Target Device diff --git a/src/tests/functional/plugin/shared/src/snippets/add.cpp 
b/src/tests/functional/plugin/shared/src/snippets/add.cpp index c524a54539f304..f0a1908ef2a9e8 100644 --- a/src/tests/functional/plugin/shared/src/snippets/add.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/add.cpp @@ -5,6 +5,8 @@ #include "common_test_utils/common_utils.hpp" #include "snippets/add.hpp" #include "subgraph_simple.hpp" +#include "ngraph_functions/builders.hpp" +#include "functional_test_utils/skip_tests_config.hpp" namespace ov { namespace test { @@ -87,6 +89,38 @@ void AddRollConst::SetUp() { setInferenceType(type); } +std::string AddSinhPair::getTestCaseName(testing::TestParamInfo obj) { + std::vector input_shapes; + ov::element::Type type; + std::string targetDevice; + size_t num_nodes, num_subgraphs; + std::tie(input_shapes, type, num_nodes, num_subgraphs, targetDevice) = obj.param; + if (input_shapes.size() != 2) + IE_THROW() << "Invalid input shapes vector size"; + std::ostringstream result; + result << "IS[0]=" << CommonTestUtils::vec2str(input_shapes[0]) << "_"; + result << "IS[1]=" << CommonTestUtils::vec2str(input_shapes[1]) << "_"; + result << "T=" << type << "_"; + result << "#N=" << num_nodes << "_"; + result << "#S=" << num_subgraphs << "_"; + result << "targetDevice=" << targetDevice; + return result.str(); +} + +void AddSinhPair::SetUp() { + std::vector input_shapes; + ov::element::Type type; + std::tie(input_shapes, type, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); + std::vector is; + for (const auto& s : input_shapes) { + is.emplace_back(InputShape {{}, {s, }}); + } + init_input_shapes(is); + auto f = ov::test::snippets::AddSinhFunction({input_shapes[0], input_shapes[1]}); + function = f.getOriginal(); + setInferenceType(type); +} + TEST_P(Add, CompareWithRefImpl) { run(); validateNumSubgraphs(); @@ -107,6 +141,10 @@ TEST_P(AddRollConst, CompareWithRefImpl) { validateNumSubgraphs(); } +TEST_P(AddSinhPair, CompareWithRefImpl) { + run(); + validateNumSubgraphs(); +} } // namespace snippets } // 
namespace test diff --git a/src/tests/functional/plugin/shared/src/snippets/convert.cpp b/src/tests/functional/plugin/shared/src/snippets/convert.cpp index b4c5c840cb6869..9f9c343d351ace 100644 --- a/src/tests/functional/plugin/shared/src/snippets/convert.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/convert.cpp @@ -12,7 +12,7 @@ namespace test { namespace snippets { std::string Convert::getTestCaseName(testing::TestParamInfo obj) { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::string targetDevice; size_t num_nodes, num_subgraphs; @@ -21,7 +21,7 @@ std::string Convert::getTestCaseName(testing::TestParamInfo inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); - + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); output_type = types.second.front(); @@ -85,11 +84,10 @@ void Convert::generate_inputs(const std::vector& targetInputStaticSha } void ConvertInput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); - + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertInputFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); } @@ -125,10 +123,10 @@ parameters ConvertInput::generate_params_random() const { } void ConvertOutput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - 
init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertOutputFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); @@ -136,10 +134,10 @@ void ConvertOutput::SetUp() { } void ConvertStub::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertStubFunction(inputShape, types.first[0], types.second[0]); function = f.getOriginal(); @@ -147,40 +145,40 @@ void ConvertStub::SetUp() { } void ConvertPartialInputsAndResults::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertPartialInputsAndResultsFunction(inputShape, types.first, types.second); function = f.getOriginal(); } void ConvertManyOnInputs::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnInputsFunction(inputShape, types.first); function = f.getOriginal(); } void ConvertManyOnOutputs::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, 
ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnOutputsFunction(inputShape, types.first); function = f.getOriginal(); } void ConvertManyOnInputOutput::SetUp() { - std::vector inputShape; + std::vector inputShape; std::pair, std::vector> types; std::tie(inputShape, types, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::ConvertManyOnInputOutputFunction(inputShape, types.first, types.second); function = f.getOriginal(); diff --git a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp index 1140937be63359..221490bb00017c 100644 --- a/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/max_num_params_eltwise.cpp @@ -27,10 +27,10 @@ std::string MaxNumParamsEltwiseSinh::getTestCaseName(testing::TestParamInfoGetParam(); - std::vector expandedShapes(10, inputShape); + std::vector expandedShapes(10, inputShape); std::vector input_shapes; for (const auto& s : expandedShapes) { - input_shapes.emplace_back(InputShape {{}, {s, }}); + input_shapes.emplace_back(InputShape {{}, {s.get_shape(), }}); } init_input_shapes(input_shapes); diff --git a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp index 276218e6150c57..b2ebed8e6f1ccc 100644 --- a/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/three_inputs_eltwise.cpp @@ -5,6 +5,7 @@ #include 
"common_test_utils/common_utils.hpp" #include "snippets/three_inputs_eltwise.hpp" #include "subgraph_simple.hpp" +#include "functional_test_utils/skip_tests_config.hpp" namespace ov { namespace test { diff --git a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp index 205587e1a30f97..2e4ae1b0643adc 100644 --- a/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp +++ b/src/tests/functional/plugin/shared/src/snippets/two_inputs_and_outputs.cpp @@ -11,14 +11,14 @@ namespace test { namespace snippets { std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo obj) { - std::vector inputShapes; + std::vector inputShapes; std::string targetDevice; size_t num_nodes, num_subgraphs; std::tie(inputShapes, num_nodes, num_subgraphs, targetDevice) = obj.param; std::ostringstream result; for (auto i = 0; i < inputShapes.size(); i++) - result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i]) << "_"; + result << "IS[" << i << "]=" << CommonTestUtils::vec2str(inputShapes[i].get_shape()) << "_"; result << "#N=" << num_nodes << "_"; result << "#S=" << num_subgraphs << "_"; result << "targetDevice=" << targetDevice; @@ -26,9 +26,9 @@ std::string TwoInputsAndOutputs::getTestCaseName(testing::TestParamInfo inputShape; + std::vector inputShape; std::tie(inputShape, ref_num_nodes, ref_num_subgraphs, targetDevice) = this->GetParam(); - init_input_shapes(static_shapes_to_test_representation(inputShape)); + init_input_shapes(dynamic_shapes_to_test_representation(inputShape)); auto f = ov::test::snippets::TwoInputsAndOutputsFunction(inputShape); function = f.getOriginal(); } diff --git a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp index 2d1038d272b8a9..80ab2ce90f28d5 100644 --- 
a/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp +++ b/src/tests/functional/shared_test_classes/include/shared_test_classes/base/ov_subgraph.hpp @@ -61,6 +61,14 @@ class SubgraphBaseTest : public CommonTestUtils::TestsCommon { virtual std::vector get_plugin_outputs(); }; +inline std::vector dynamic_shapes_to_test_representation(const std::vector& shapes) { + std::vector result; + for (const auto& staticShape : shapes) { + result.push_back({{staticShape}, {staticShape.get_shape()}}); + } + return result; +} + inline std::vector> static_shapes_to_test_representation(const std::vector>& shapes) { std::vector> result; for (const auto& staticShapes : shapes) { diff --git a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp index 237e3b57dd4484..5ab13dafe68dff 100644 --- a/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp +++ b/src/tests/functional/shared_test_classes/src/base/utils/generate_inputs.cpp @@ -43,9 +43,15 @@ ov::runtime::Tensor generate(const std::shared_ptr& node, } namespace Activation { +// todo: this is a bug fixed! Merge it separately. +// Default parameters InputGenerateData(10, 20, 32768, 1) lead to input generation according to 10 + x/32768, +// where x {0, 20}, so all generated values are in the range [10, 10 + 6.1e-4]. 
+// Thus all the interval more-or-less fall within the uncertainty validation interval +// Fix let the range be at least 20x of resolution ov::runtime::Tensor generate(const ov::element::Type& elemType, const ov::Shape& targetShape, - InputGenerateData inGenData = InputGenerateData(10, 20, 32768, 1)) { +// InputGenerateData inGenData = InputGenerateData(10, 20, 32768, 1)) { + InputGenerateData inGenData = InputGenerateData(-1, 2*32768, 32768, 1)) { if (!elemType.is_signed()) { inGenData.range = 15; inGenData.start_from = 0; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp index 505829b9fd20dd..08e3dfc9c859e8 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/snippets_helpers.hpp @@ -18,7 +18,7 @@ class SnippetsFunctionBase { public: SnippetsFunctionBase() = delete; - explicit SnippetsFunctionBase(const std::vector& inputShapes, ov::element::Type_t precision = element::f32) + explicit SnippetsFunctionBase(const std::vector& inputShapes, ov::element::Type_t precision = element::f32) : input_shapes{inputShapes}, precision{precision} {}; std::shared_ptr getReference() const { @@ -53,7 +53,7 @@ class SnippetsFunctionBase { } const ov::element::Type_t precision; - const std::vector input_shapes; + const std::vector input_shapes; virtual void validate_function(const std::shared_ptr &f) const; }; @@ -67,7 +67,7 @@ class SnippetsFunctionBase { class SnippetsFunctionCustomizable : public SnippetsFunctionBase { public: SnippetsFunctionCustomizable() = delete; - SnippetsFunctionCustomizable(const std::vector& inputShapes, + SnippetsFunctionCustomizable(const std::vector& inputShapes, const std::vector>& customOps, const std::vector&& customOpsNumInputs); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp index a7c6bd34e0f58e..526234409b348e 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_converts.hpp @@ -22,7 +22,7 @@ namespace snippets { // Result class ConvertFunction : public SnippetsFunctionBase { public: - explicit ConvertFunction(const std::vector& inputShapes, + explicit ConvertFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::u8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -45,7 +45,7 @@ class ConvertFunction : public SnippetsFunctionBase { // Result class ConvertInputFunction : public SnippetsFunctionBase { public: - explicit ConvertInputFunction(const std::vector& inputShapes, + explicit ConvertInputFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::u8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -67,7 +67,7 @@ class ConvertInputFunction : public SnippetsFunctionBase { // Result class ConvertOutputFunction : public SnippetsFunctionBase { public: - explicit ConvertOutputFunction(const std::vector& inputShapes, + explicit ConvertOutputFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = ov::element::i8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -91,7 +91,7 @@ class ConvertOutputFunction : public SnippetsFunctionBase { // Result Result class ConvertStubFunction : public SnippetsFunctionBase { public: - explicit ConvertStubFunction(const std::vector& inputShapes, + explicit ConvertStubFunction(const std::vector& inputShapes, const ov::element::Type inType = ov::element::f32, const ov::element::Type outType = 
ov::element::i8) : SnippetsFunctionBase(inputShapes), inType(inType), outType(outType) { @@ -117,7 +117,7 @@ class ConvertStubFunction : public SnippetsFunctionBase { // Result2 class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase { public: - explicit ConvertPartialInputsAndResultsFunction(const std::vector& inputShapes, + explicit ConvertPartialInputsAndResultsFunction(const std::vector& inputShapes, const std::vector& inTypes = {ov::element::f32}, const std::vector& outTypes = {ov::element::f32}) : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { @@ -142,7 +142,7 @@ class ConvertPartialInputsAndResultsFunction : public SnippetsFunctionBase { // Result class ConvertManyOnInputsFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) + explicit ConvertManyOnInputsFunction(const std::vector& inputShapes, const std::vector& types) : SnippetsFunctionBase(inputShapes), types(types) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); @@ -165,7 +165,7 @@ class ConvertManyOnInputsFunction : public SnippetsFunctionBase { // Result Result class ConvertManyOnOutputsFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) + explicit ConvertManyOnOutputsFunction(const std::vector& inputShapes, const std::vector& types) : SnippetsFunctionBase(inputShapes), types(types) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); NGRAPH_CHECK(types.size() > 1, "Got invalid number of element types"); @@ -191,7 +191,7 @@ class ConvertManyOnOutputsFunction : public SnippetsFunctionBase { // Result Result class ConvertManyOnInputOutputFunction : public SnippetsFunctionBase { public: - explicit ConvertManyOnInputOutputFunction(const std::vector& 
inputShapes, + explicit ConvertManyOnInputOutputFunction(const std::vector& inputShapes, const std::vector& inTypes, const std::vector& outTypes) : SnippetsFunctionBase(inputShapes), inTypes(inTypes), outTypes(outTypes) { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp index b663c22671f30a..3cbcfdac4a5af6 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_customizable.hpp @@ -28,7 +28,7 @@ namespace snippets { // Result class ConvMulActivationFunction : public SnippetsFunctionCustomizable { public: - explicit ConvMulActivationFunction(const std::vector& inputShapes, const std::vector>& customOps) + explicit ConvMulActivationFunction(const std::vector& inputShapes, const std::vector>& customOps) : SnippetsFunctionCustomizable(inputShapes, customOps, {2, 1, 1}) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); NGRAPH_CHECK(input_shapes[0].size() == 4, "Only 4D input shapes are currently supported"); @@ -36,6 +36,7 @@ class ConvMulActivationFunction : public SnippetsFunctionCustomizable { ov::op::util::is_unary_elementwise_arithmetic(customOps[1]) && ov::op::util::is_unary_elementwise_arithmetic(customOps[2]), "Got invalid custom ops: expected binary and two unary operations"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static(), "This test supports only static shapes"); } private: std::shared_ptr initOriginal() const override; diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp index fad086acf031e1..69027e96452751 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_lowered.hpp @@ -22,7 +22,7 @@ namespace snippets { class AddFunctionLoweredBroadcast : public AddFunction { public: - explicit AddFunctionLoweredBroadcast(const std::vector& inputShapes, const std::vector& broadcastShapes) : + explicit AddFunctionLoweredBroadcast(const std::vector& inputShapes, const std::vector& broadcastShapes) : AddFunction(inputShapes), broadcast_shapes{broadcastShapes} { NGRAPH_CHECK(input_shapes.size() == broadcast_shapes.size(), "Broadcast shapes should have the same size as input_shapes"); @@ -37,10 +37,12 @@ class AddFunctionLoweredBroadcast : public AddFunction { class EltwiseThreeInputsLoweredFunction : public EltwiseThreeInputsFunction { public: - explicit EltwiseThreeInputsLoweredFunction(const std::vector& inputShapes, const std::vector& broadcastShapes) : + explicit EltwiseThreeInputsLoweredFunction(const std::vector& inputShapes, const std::vector& broadcastShapes) : EltwiseThreeInputsFunction(inputShapes), broadcast_shapes{broadcastShapes} { NGRAPH_CHECK(input_shapes.size() == broadcast_shapes.size(), "Broadcast shapes should have the same size as input_shapes"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static() && input_shapes[2].is_static(), + "Broadcast shapes should have the same size as input_shapes"); } protected: diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp index dd9f342d7b388d..a1254dfaa80521 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/include/subgraph_simple.hpp @@ -22,7 +22,7 @@ namespace snippets { // Result class AddFunction : public SnippetsFunctionBase { public: - explicit AddFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddFunction(const std::vector& 
inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -39,7 +39,7 @@ class AddFunction : public SnippetsFunctionBase { // todo: remove Sinh once "no subgraph after input" limitation is relaxed class AddSinhFunction : public SnippetsFunctionBase { public: - explicit AddSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -54,8 +54,9 @@ class AddSinhFunction : public SnippetsFunctionBase { // todo: remove Sinh once "no subgraph after input" limitation is relaxed class AddSinhConstFunction : public SnippetsFunctionBase { public: - explicit AddSinhConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddSinhConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "This test supports only static shapes"); } protected: std::shared_ptr initOriginal() const override; @@ -71,8 +72,9 @@ class AddSinhConstFunction : public SnippetsFunctionBase { // The function is needed to check different input element types (model precision change) class AddRollConstFunction : public SnippetsFunctionBase { public: - explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit AddRollConstFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 1, "Got invalid number of input shapes"); + NGRAPH_CHECK(input_shapes[0].is_static(), "Only static shapes are supported"); } protected: std::shared_ptr initOriginal() const override; @@ -87,7 +89,7 @@ class AddRollConstFunction : public 
SnippetsFunctionBase { // Result class EltwiseFunction : public SnippetsFunctionBase { public: - explicit EltwiseFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -102,7 +104,7 @@ class EltwiseFunction : public SnippetsFunctionBase { // Result class EltwiseThreeInputsFunction : public SnippetsFunctionBase { public: - explicit EltwiseThreeInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseThreeInputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); } protected: @@ -113,7 +115,7 @@ class EltwiseThreeInputsFunction : public SnippetsFunctionBase { // todo: remove Sinh once "no subgraph after input" limitation is relaxed class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase { public: - explicit EltwiseThreeInputsSinhFunction(const std::vector& inputShapes) : + explicit EltwiseThreeInputsSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 3, "Got invalid number of input shapes"); } @@ -131,7 +133,7 @@ class EltwiseThreeInputsSinhFunction : public SnippetsFunctionBase { // todo: remove Sinh once "no subgraph after input" limitation is relaxed class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase { public: - explicit EltwiseMaxNumParamsSinhFunction(const std::vector& inputShapes) : + explicit EltwiseMaxNumParamsSinhFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 10, "Got invalid number of input shapes"); } @@ -147,7 +149,7 @@ class EltwiseMaxNumParamsSinhFunction : public SnippetsFunctionBase { // Result class MatMulEltwiseBranchesFunction : 
public SnippetsFunctionBase { public: - explicit MatMulEltwiseBranchesFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit MatMulEltwiseBranchesFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); NGRAPH_CHECK(input_shapes[0].size() == 4 && input_shapes[1].size() == 4, "Only 4D input shapes are currently supported by this test"); @@ -155,6 +157,7 @@ class MatMulEltwiseBranchesFunction : public SnippetsFunctionBase { // Note that single-element constant are not supported by the test, since they'll be converted // to snippets::op::Scalar. So a more comlex logics is required to produce reference function. NGRAPH_CHECK(input_shapes[0][1] == input_shapes[1][1], "Channel dimensions must be equal and != 1"); + NGRAPH_CHECK(input_shapes[0].is_static() && input_shapes[1].is_static(), "This test supports only static shapes"); } protected: @@ -170,7 +173,7 @@ class MatMulEltwiseBranchesFunction : public SnippetsFunctionBase { // Result class EltwiseLogLoopFunction : public SnippetsFunctionBase { public: - explicit EltwiseLogLoopFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseLogLoopFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -188,7 +191,7 @@ class EltwiseLogLoopFunction : public SnippetsFunctionBase { // Result class EltwiseTwoResultsFunction : public SnippetsFunctionBase { public: - explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit EltwiseTwoResultsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: @@ -206,7 +209,7 @@ class EltwiseTwoResultsFunction : public SnippetsFunctionBase { 
// Result class TwoInputsAndOutputsFunction : public SnippetsFunctionBase { public: - explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { + explicit TwoInputsAndOutputsFunction(const std::vector& inputShapes) : SnippetsFunctionBase(inputShapes) { NGRAPH_CHECK(input_shapes.size() == 2, "Got invalid number of input shapes"); } protected: diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp index ff7cdc986a59b1..8cec4a4aca95f6 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/snippets_helpers.cpp @@ -15,11 +15,11 @@ void SnippetsFunctionBase::validate_function(const std::shared_ptr &f) co NGRAPH_CHECK(params.size() == input_shapes.size(), "Passed input shapes and produced function are inconsistent."); for (size_t i = 0; i < input_shapes.size(); i++) - NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_shape().begin()), + NGRAPH_CHECK(std::equal(input_shapes[i].begin(), input_shapes[i].end(), params[i]->get_partial_shape().begin()), "Passed input shapes and produced function are inconsistent."); } -SnippetsFunctionCustomizable::SnippetsFunctionCustomizable(const std::vector& inputShapes, +SnippetsFunctionCustomizable::SnippetsFunctionCustomizable(const std::vector& inputShapes, const std::vector>& customOps, const std::vector&& customOpsNumInputs) : SnippetsFunctionBase(inputShapes), custom_ops{customOps}, custom_ops_num_inputs{customOpsNumInputs} { diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp index ccf1ce4081e204..9975f5185c1b61 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp +++ 
b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_customizable.cpp @@ -13,7 +13,7 @@ namespace snippets { std::shared_ptr ConvMulActivationFunction::initOriginal() const { auto conv_param = std::make_shared(precision, input_shapes[0]); - const auto channels = input_shapes[0][1]; + const auto channels = static_cast(input_shapes[0][1].get_length()); ngraph::Shape strides(2, 1); std::vector pad_begin(2, 1), pad_end(2, 1); const Shape const_shape {channels, channels, 3, 3}; @@ -37,7 +37,7 @@ std::shared_ptr ConvMulActivationFunction::initReference() const { auto conv_param = std::make_shared(precision, input_shapes[0]); ngraph::Shape strides(2, 1); std::vector pad_begin(2, 1), pad_end(2, 1); - const auto channels = input_shapes[0][1]; + const auto channels = static_cast(input_shapes[0][1].get_length()); const Shape const_shape {channels, channels, 3, 3}; const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(const_shape), -10., 10.); auto weights = std::make_shared(precision, const_shape, const_values); diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp index 8fd664b192187d..d04db522a54881 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_lowered.cpp @@ -6,6 +6,7 @@ #include "common_test_utils/data_utils.hpp" #include #include "ngraph_functions/builders.hpp" +#include namespace ov { namespace test { @@ -14,7 +15,7 @@ namespace snippets { std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { auto data0 = std::make_shared(precision, input_shapes[0]); std::shared_ptr add_input0 = nullptr; - if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].back()) { + if (!broadcast_shapes[0].empty() && broadcast_shapes[0].back() != input_shapes[0].rbegin()->get_length()) { 
add_input0 = std::make_shared(data0, broadcast_shapes[0]); } else { add_input0 = std::make_shared(data0); @@ -22,18 +23,38 @@ std::shared_ptr AddFunctionLoweredBroadcast::initLowered() const { auto data1 = std::make_shared(precision, input_shapes[1]); std::shared_ptr add_input1 = nullptr; - if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].back()) { + if (!broadcast_shapes[1].empty() && broadcast_shapes[1].back() != input_shapes[1].rbegin()->get_length()) { add_input1 = std::make_shared(data1, broadcast_shapes[1]); } else { add_input1 = std::make_shared(data1); } auto add = std::make_shared(add_input0, add_input1); auto store = std::make_shared(add); - return std::make_shared(NodeVector{store}, ParameterVector{data0, data1}); + ParameterVector input_params {data0, data1}; + auto model = std::make_shared(NodeVector{store}, input_params); + + // Create dummy scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + std::vector apply_increments(input_params.size() + results.size(), true); + insertLoopEnd(results, inner_loop_begin, 1, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + insertLoopEnd(results, outer_loop_begin, 0, 1, 1, apply_increments); + } + return model; } std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() const { // todo: implement conversion between std::vector and std::vector - auto input_params = ngraph::builder::makeParams(precision, {input_shapes[0], input_shapes[1], input_shapes[2]}); + auto input_params = 
ngraph::builder::makeParams(precision, + {input_shapes[0].get_shape(), + input_shapes[1].get_shape(), + input_shapes[2].get_shape()}); auto load_or_broadcastload = [&](size_t i) -> std::shared_ptr { // user specified that no broadcasting is required if (broadcast_shapes[i].empty()) { @@ -41,7 +62,7 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons // broadcasting is required: could be Load + BroadcastMove or BroiadcastLoad } else { // The last dim is processed by vector Tile, so BroadcastLoad is required if the last dim being broadcasted - if (input_shapes[i].back() == 1 && broadcast_shapes[i].back() != 1) { + if (input_shapes[i].rbegin()->get_length() == 1 && broadcast_shapes[i].back() != 1) { return std::make_shared(input_params[i], broadcast_shapes[i]); // Todo: Cover this logics with functional tests, Review FakeBroadcast Emitter // Broadcasting of other dims is handled by BroadcastMove. Strictly speaking, broadcasting is achieved via @@ -57,12 +78,6 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto sub_scalar = std::make_shared(precision, Shape{1}, const_values[0]); std::shared_ptr sub_load; -// Todo: Uncomment when invalid read in vector tile will be fixed -// if (input_shapes[2].back() == 1) -// sub_load = std::make_shared(input_params[2]); -// else -// sub_load = std::make_shared(input_params[2]); -// remove when the code above is enabled: sub_load = std::make_shared(input_params[2]); auto sub = std::make_shared(sub_load, sub_scalar); std::shared_ptr sub_out; @@ -72,7 +87,23 @@ std::shared_ptr EltwiseThreeInputsLoweredFunction::initLowered() cons sub_out = std::make_shared(sub, broadcast_shapes[2]); auto mul = std::make_shared(add, sub_out); auto store = std::make_shared(mul); - return std::make_shared(NodeVector{store}, input_params); + auto model = std::make_shared(NodeVector{store}, input_params); + + // Create dummy 
scheduler to pass graph comparison tests + // Note that if there is more than one results, they should be reverted + ResultVector results({model->get_results()[0]}); + const auto& inner_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + std::vector apply_increments(input_params.size() + results.size(), true); + const auto& inner_loop_end = insertLoopEnd(results, inner_loop_begin, 1, 1, 1, apply_increments); + auto outer_WA = std::accumulate(input_shapes.begin(), input_shapes.end(), 0, + [](int64_t max_val, const PartialShape& ps) { + return std::max(ps[ps.size() - 2].get_length(), max_val); + }); + if (outer_WA > 1) { + const auto& outer_loop_begin = ngraph::snippets::op::insertLoopBegin(input_params); + insertLoopEnd(results, outer_loop_begin, 0, 1, 1, apply_increments); + } + return model; } } // namespace snippets } // namespace test diff --git a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp index 237e9b717273d4..6fa4648a5548a9 100644 --- a/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp +++ b/src/tests/ngraph_helpers/snippets_ngraph_functions/src/subgraph_simple.cpp @@ -47,17 +47,19 @@ std::shared_ptr AddSinhFunction::initReference() const { return std::make_shared(NodeVector{add}, ParameterVector{data0, data1}); } std::shared_ptr AddSinhConstFunction::initOriginal() const { + Shape static_input_shape = input_shapes[0].get_shape(); auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(static_input_shape), -10., 10.); + auto const_data1 = std::make_shared(precision, static_input_shape, const_values); auto sin0 = 
std::make_shared(data0); auto add = std::make_shared(sin0, const_data1); return std::make_shared(NodeVector{add}, ParameterVector{data0}); } std::shared_ptr AddRollConstFunction::initOriginal() const { - auto data0 = std::make_shared(precision, input_shapes[0]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[0]), -10., 10.); - auto const_data1 = std::make_shared(precision, input_shapes[0], const_values); + const auto input_shape = input_shapes[0].get_shape(); + auto data0 = std::make_shared(precision, input_shape); + const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shape), -10., 10.); + auto const_data1 = std::make_shared(precision, input_shape, const_values); auto shift = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{1}); auto axes = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{0}); auto roll0 = std::make_shared(data0, shift, axes); @@ -70,7 +72,7 @@ std::shared_ptr AddRollConstFunction::initOriginal() const { std::shared_ptr EltwiseFunction::initOriginal() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[1]), -10., 10.); + const std::vector const_values = CommonTestUtils::generate_float_numbers(1, -10., 10.); auto const_data = std::make_shared(precision, data1->get_shape(), const_values); auto add = std::make_shared(data0, data1); auto sub = std::make_shared(add, const_data); @@ -80,7 +82,7 @@ std::shared_ptr EltwiseFunction::initOriginal() const { std::shared_ptr EltwiseFunction::initReference() const { auto data0 = std::make_shared(precision, input_shapes[0]); auto data1 = std::make_shared(precision, input_shapes[1]); - const std::vector const_values = CommonTestUtils::generate_float_numbers(shape_size(input_shapes[1]), -10., 10.); + const std::vector const_values = 
CommonTestUtils::generate_float_numbers(1, -10., 10.); auto const_data = std::make_shared(precision, data1->get_shape(), const_values); auto indata0 = std::make_shared(precision, data0->get_shape()); auto indata1 = std::make_shared(precision, data1->get_shape()); @@ -177,8 +179,8 @@ std::shared_ptr MatMulEltwiseBranchesFunction::initReference() const auto sub_const_2 = std::make_shared(precision, Shape{1}, const_values[3]); // snippet function - Shape matMulOutShape = input_shapes[0]; - matMulOutShape.back() = input_shapes[1].back(); + Shape matMulOutShape = input_shapes[0].get_shape(); + matMulOutShape.back() = input_shapes[1].get_shape().back(); auto snippet_input = std::make_shared(precision, matMulOutShape); auto mul_1 = std::make_shared(snippet_input, mul_const_1);