Skip to content

Commit

Permalink
Post rebase fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
v-Golubev committed Dec 5, 2023
1 parent aedbdf3 commit 68b48a0
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 34 deletions.
15 changes: 4 additions & 11 deletions src/common/snippets/src/lowered/pass/identify_buffers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,19 +74,14 @@ void IdentifyBuffers::update_adj_matrix(const std::pair<ExpressionPtr, ShiftPtrP
}

std::vector<bool> IdentifyBuffers::create_adjacency_matrix(const LinearIR& linear_ir, const BufferPool& pool) {
// There are several sync points for adjacency check:
// 1. Loop because only in Loop we increment pointers. So if some Buffers in the one Loop have conflict
// (cannot be inplace: the different ptr increment and data sizes) they are called as adjacent
// 2. Brgemm because its blocking implementation requires Buffers with unique memory on all inputs and outputs
// The sync point for the adjacency check is Loop, because pointers are incremented only inside Loops.
// So if some Buffers in the same Loop conflict (they cannot be in-place: different ptr increments and data sizes),
// they are considered adjacent
const auto size = pool.size();
std::vector<bool> adj(size * size, false);
for (size_t i = 0; i < size; ++i)
adj[index(size, i, i)] = true;

auto is_buffer = [](const ExpressionPort& port) {
return ov::is_type<op::Buffer>(port.get_expr()->get_node());
};

for (auto expr_it = linear_ir.cbegin(); expr_it != linear_ir.cend(); expr_it++) {
const auto &expr = *expr_it;
const auto& loop_end = ov::as_type_ptr<op::LoopEnd>(expr->get_node());
Expand Down Expand Up @@ -207,9 +202,7 @@ auto IdentifyBuffers::coloring(BufferPool& buffers, std::vector<bool>& adj) -> s

bool IdentifyBuffers::run(LinearIR& linear_ir) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::IdentifyBuffers")
// Unite Buffers using Graph coloring algorithm.
// Notes: We identify only Buffer with Intermediate memory because Buffers with new memory are used only in Brgemm case
// so these Buffers are always IntermediateBuffer nonadjacent
// Identify Buffers using Graph coloring algorithm.
BufferPool buffer_pool;

for (const auto& expr : linear_ir) {
Expand Down
54 changes: 40 additions & 14 deletions src/common/snippets/src/lowered/pass/insert_buffers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,26 +49,52 @@ ov::Shape compute_allocation_shape(const LinearIR::LoopManagerPtr& loop_manager,
return allocation_shape;
}

auto set_rest_dims_to_ones = [&](const int filled_dims_count) {
for (int i = 0; i < static_cast<int>(allocation_shape.size()) - filled_dims_count; ++i) {
allocation_shape[i] = 1;
// If subtensor is set, its information is used for allocation shape computation. Two situations are possible:
// 1. Buffer is outside the parent loop: the corresponding subtensor value is ignored, parent loop work amount is set instead
// 2. Buffer is inside the parent loop: the corresponding subtensor value is used in allocation shape.
// Since we can definitely know which subtensor value corresponds to the loop only for the 1st case
// (we can extract this info from the loop exit port), we copy the subtensor, and then replace subtensor values with the parent loop work amount if needed.
// Example:
// Parent subtensor: [M_blk, N_blk]
// Buffer loop indices: [M_loop_idx], parent loop indices: [M_loop_idx, N_loop_idx]
//
// 1. Allocation shape is set to subtensor: [M_blk, N_blk]
// 2. Buffer is inside M_loop_idx loop => allocation shape is not changed
// 3. Buffer is outside N_loop_idx loop => the corresponding allocation shape value is replaced with N loop work amount
// So the result allocation shape is [M_blk, N_loop_work_amount]
const auto& subtensor = expr_port.get_descriptor_ptr()->get_subtensor();
if (!subtensor.empty()) {
for (size_t i = 0; i < std::min(rank, subtensor.size()); ++i) {
auto& cur_val = *(allocation_shape.rbegin() + i);
const auto& subtensor_val = *(subtensor.rbegin() + i);
cur_val = std::min(cur_val, subtensor_val);
}
for (const auto& parent_loop : parent_loop_ids) {
if (std::find(buffer_loop_ids.begin(), buffer_loop_ids.end(), parent_loop) == buffer_loop_ids.end()) {
const auto loop_info = loop_manager->get_loop_info(parent_loop);
const auto& exit_points = loop_info->get_exit_points();
auto it = std::find_if(exit_points.begin(),
exit_points.end(),
[&expr_port](const LinearIR::LoopManager::LoopPort& port) {
return *port.expr_port == expr_port;
});
OPENVINO_ASSERT(it != exit_points.end(), "compute_allocation_shape: exit point of parent loop can not be found");
const auto& loop_port = *it;
if (loop_port.is_incremented && loop_port.dim_idx < allocation_shape.size()) {
*(allocation_shape.rbegin() + loop_port.dim_idx) = loop_info->get_work_amount();
}
}
}
};

// In some cases it's possible to allocate less shape
// 1. Buffer and its parent are in the same loop: allocation size for the outer dimension can be extracted from loop increment
// 2. Buffer is outside the parent's loops: allocation size can be extracted from the corresponding loop work amount
// TODO: Use general logic with the help of memory counts for allocation shape computation
if (buffer_loop_ids.back() == parent_loop_ids.back()) {
const auto buffer_loop = loop_manager->get_loop_info(buffer_loop_ids.back());
*(allocation_shape.rbegin() + 1) = buffer_loop->get_increment();
set_rest_dims_to_ones(2);
} else {
// WA: In case of empty subtensors, other information has to be used to update the allocation shape.
for (size_t i = 0; i < std::min(rank, parent_loop_ids.size()); ++i) {
const auto loop = loop_manager->get_loop_info(*(parent_loop_ids.rbegin() + i));
OPENVINO_ASSERT(loop->get_dim_idx() == i, "compute_allocation_shape: eltwise loop has unexpected dimension index");
*(allocation_shape.rbegin() + i) = loop->get_work_amount();
}
set_rest_dims_to_ones(static_cast<int>(parent_loop_ids.size()));
for (int i = 0; i < allocation_rank - static_cast<int>(parent_loop_ids.size()); ++i) {
allocation_shape[i] = 1;
}
}
return allocation_shape;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,9 @@ std::shared_ptr<ov::Model> EltwiseBufferAllocationTest::GetModel() const {
const auto parameter0 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 3, 100, 100}));
const auto parameter1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape({1, 3, 100, 100}));
const auto add = std::make_shared<ov::op::v1::Add>(parameter0, parameter1);
const auto buffer0 = std::make_shared<ov::snippets::op::Buffer>(add, static_cast<int32_t>(subtensor_buffer.size()));
const auto buffer0 = std::make_shared<ov::snippets::op::IntermediateMemoryBuffer>(add, static_cast<int32_t>(subtensor_buffer.size()));
const auto relu = std::make_shared<ov::op::v0::Relu>(buffer0);
const auto buffer1 = std::make_shared<ov::snippets::op::Buffer>(relu, static_cast<int32_t>(subtensor_buffer.size()));
const auto buffer1 = std::make_shared<ov::snippets::op::IntermediateMemoryBuffer>(relu, static_cast<int32_t>(subtensor_buffer.size()));
const auto exp = std::make_shared<ov::op::v0::Exp>(buffer1);
const auto body = std::make_shared<ov::Model>(std::make_shared<ov::op::v0::Result>(exp), ov::ParameterVector{parameter0, parameter1});

Expand All @@ -119,7 +119,7 @@ void MHABufferAllocationTest::MarkBrgemm(const std::shared_ptr<ov::snippets::op:
}

std::shared_ptr<ov::Model> MHABufferAllocationTest::GetModel() const {
const auto subtensor_scalar = std::vector<size_t>{1, 1};
const auto subtensor_scalar = std::vector<size_t>{1};
const auto subtensor_eltwise = std::vector<size_t>{1, m_vector_size};
const auto subtensor_brgemm = std::vector<size_t>{32, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
Expand Down Expand Up @@ -187,7 +187,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHA
::testing::Values(true),
::testing::Values(true),
::testing::Values(57344), // (Buffer before brgemm) + (between brgemms) + (after brgemm)
::testing::Values(3)), // (Buffer before brgemm) + (between brgemms) + (after brgemm)
::testing::Values(2)), // (Buffer before brgemm0 and after brgemm1) + (between brgemms)
BufferAllocationTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABufferAllocationTest,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,8 @@ class BufferAllocationCPUTest : public testing::TestWithParam<BufferAllocationCP

void ApplyTransformations(bool is_optimized, bool with_split_loops) {
ov::snippets::lowered::pass::PassPipeline pipeline;
pipeline.register_pass<ov::intel_cpu::pass::BrgemmBlocking>();
pipeline.register_pass<ov::snippets::lowered::pass::MarkLoops>(m_vector_size);
pipeline.register_pass<ov::intel_cpu::pass::BrgemmBlocking>();
pipeline.register_pass<ov::snippets::lowered::pass::SoftmaxDecomposition>(m_vector_size);
pipeline.register_pass<ov::snippets::lowered::pass::FuseLoops>();
if (with_split_loops)
Expand Down Expand Up @@ -120,7 +120,7 @@ class BufferAllocationCPUTest : public testing::TestWithParam<BufferAllocationCP
class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest {
protected:
std::shared_ptr<ov::Model> GetModel() const override {
const auto subtensor_scalar = std::vector<size_t>{1, 1};
const auto subtensor_scalar = std::vector<size_t>{1};
const auto subtensor_softmax = std::vector<size_t>{1, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM};
const auto subtensor_full = std::vector<size_t>(2, ov::snippets::lowered::PortDescriptor::ServiceDimensions::FULL_DIM);

Expand All @@ -136,21 +136,25 @@ class MHABF16AMXBufferAllocationTest : public BufferAllocationCPUTest {

const auto brgemm_copyb0 = std::make_shared<ov::intel_cpu::BrgemmCopyB>(
convert1, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0);
const auto scratch0 = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
const auto scratch0 = std::make_shared<ov::snippets::op::NewMemoryBuffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
const auto brgemm_cpu0 = std::make_shared<ov::intel_cpu::BrgemmCPU>(
parameter0, brgemm_copyb0->output(0), scratch0, ov::intel_cpu::BrgemmCPU::Type::AMX);
brgemm_cpu0->set_m_block_size(32);
brgemm_cpu0->set_k_block_size(16);
brgemm_cpu0->set_n_block_size(64);

const auto relu1 = std::make_shared<ov::op::v0::Relu>(brgemm_cpu0);
const auto softmax = std::make_shared<ov::op::v1::Softmax>(relu1, 3);
const auto convert2 = std::make_shared<ov::snippets::op::ConvertSaturation>(softmax, ov::element::bf16);

const auto brgemm_copyb1 = std::make_shared<ov::intel_cpu::BrgemmCopyB>(
parameter2, ov::element::bf16, ov::intel_cpu::BrgemmCopyB::OnlyRepacking, 0, 0, 0);
const auto scratch1 = std::make_shared<ov::snippets::op::Buffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
const auto scratch1 = std::make_shared<ov::snippets::op::NewMemoryBuffer>(ov::Shape{ov::intel_cpu::BrgemmCPU::SCRATCH_BYTE_SIZE});
const auto brgemm_cpu1 = std::make_shared<ov::intel_cpu::BrgemmCPU>(
convert2, brgemm_copyb1->output(0), scratch1, ov::intel_cpu::BrgemmCPU::Type::AMX);
brgemm_cpu1->set_m_block_size(32);
brgemm_cpu1->set_k_block_size(16);
brgemm_cpu1->set_n_block_size(64);

const auto relu2 = std::make_shared<ov::op::v0::Relu>(brgemm_cpu1);

Expand Down Expand Up @@ -191,7 +195,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHAOptimizedWSplit, MHA
::testing::Values(true),
::testing::Values(true),
::testing::Values(90112),
::testing::Values(4)),
::testing::Values(5)),
BufferAllocationCPUTest::getTestCaseName);

INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHANotOptimizedWOSplit, MHABF16AMXBufferAllocationTest,
Expand Down

0 comments on commit 68b48a0

Please sign in to comment.