From 64564e3d96e1ab85fd77d861fa67093317bd0c79 Mon Sep 17 00:00:00 2001 From: Kelvin Choi Date: Mon, 1 Jul 2024 17:21:16 +0900 Subject: [PATCH] [GPU] Reinterpret from 1 dim mem to 0 dim mem instead of allocating 0 bytes layout to OpenCL --- .../intel_gpu/src/runtime/ocl/ocl_engine.cpp | 26 ++++- .../tests/unit/test_cases/loop_gpu_test.cpp | 105 ++++++++++++++++++ 2 files changed, 127 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp index 4de85d27c06668..759b74ea1e4b2d 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_engine.cpp @@ -174,23 +174,41 @@ memory::ptr ocl_engine::allocate_memory(const layout& layout, allocation_type ty check_allocatable(layout, type); + auto zero_bytes_layout = false; + auto non_zero_layout = layout; + if (layout.bytes_count() == 0) { + cldnn::layout zero_dim_layout = layout; + auto mem_ps = zero_dim_layout.get_partial_shape(); + for (size_t k = 0; k < mem_ps.size(); k++) { + if (mem_ps[k] == 0) + mem_ps[k] = 1; + } + + non_zero_layout = cldnn::layout(mem_ps, zero_dim_layout.data_type, zero_dim_layout.format); + zero_bytes_layout = true; + } + try { memory::ptr res = nullptr; if (layout.format.is_image_2d()) { - res = std::make_shared(this, layout); + res = std::make_shared(this, non_zero_layout); } else if (type == allocation_type::cl_mem) { - res = std::make_shared(this, layout); + res = std::make_shared(this, non_zero_layout); } else { - res = std::make_shared(this, layout, type); + res = std::make_shared(this, non_zero_layout, type); } - if (reset || res->is_memory_reset_needed(layout)) { + if (reset || res->is_memory_reset_needed(non_zero_layout)) { auto ev = res->fill(get_service_stream()); if (ev) { get_service_stream().wait_for_events({ev}); } } + if (zero_bytes_layout) { + res = reinterpret_buffer(*res, layout); + } + return res; } catch (const cl::Error& clErr) { switch (clErr.err()) { diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp index d071c0f3416581..40650b8cbbdf13 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/loop_gpu_test.cpp @@ -1212,3 +1212,108 @@ TEST(loop_gpu, support_loop_w_dynamic_input_update_primitive_id) { std::vector(), 2, 3); } + +template +void test_loop_gpu_zero_bytes_layout(bool is_caching_test) +{ + auto& engine = get_test_engine(); + + // shape for zero bytes layout + auto trip_count_mem = engine.allocate_memory({ cldnn::layout{ ov::PartialShape({0}), data_types::i32, format::bfyx } }); + + auto input_mem = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 4, 5 } }); + auto operand_mem = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 4, 5 } }); + auto initial_condition_mem = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } }); + auto num_iteration_mem = engine.allocate_memory({ data_types::i32, format::bfyx, { 1, 1, 1, 1 } }); + + std::vector input_data{ + 1.0f, 2.0f, -15.f, 3.0f, 4.0f, -15.f, 5.0f, 6.0f, -15.f, 7.0f, + -15.f, 0.0f, 0.0f, -15.f, 0.5f, -0.5f, -15.f, 8.0f, 1.5f, 5.2f + }; + std::vector eltwise_operand { + 1.f, -2.f, 3.f, -4.f, 3.0f, -2.0f, 1.f, -2.f, 3.0f, -4.0f, + 3.f, -2.f, 1.f, -2.f, 3.5f, -4.5f, 5.f, -4.f, 3.5f, -2.2f + }; + int trip_count = 8; + int initial_condition = 1; + + // initialize input buffers + set_values(input_mem, input_data); + set_values(operand_mem, eltwise_operand); + set_values(trip_count_mem, { trip_count }); + set_values(initial_condition_mem, {initial_condition}); + + topology body( + input_layout("input", input_mem->get_layout()), + data("eltwise_operand", operand_mem), + eltwise("eltwise", input_info("input"), input_info("eltwise_operand"), eltwise_mode::sum) + ); + + std::vector input_primitive_maps { loop::io_primitive_map("input", "input") }; + std::vector output_primitive_maps { loop::io_primitive_map("loop", "eltwise") }; + std::vector back_edges { loop::backedge_mapping("eltwise", "input") }; + + auto body_program = build_program(engine, body, "", output_primitive_maps, back_edges); + + topology topology( + input_layout("input", input_mem->get_layout()), + input_layout("trip_count", trip_count_mem->get_layout()), + input_layout("initial_condition", initial_condition_mem->get_layout()), + mutable_data("num_iteration", num_iteration_mem), + loop("loop", { input_info("num_iteration"), input_info("trip_count"), input_info("initial_condition"), input_info("input") }, body_program, + "trip_count", "initial_condition", "num_iteration", + input_primitive_maps, output_primitive_maps, back_edges, 8) + ); + + cldnn::network::ptr network = get_network(engine, topology, get_test_default_config(engine), get_test_stream_ptr(), is_caching_test); + + network->set_input_data("input", input_mem); + network->set_input_data("trip_count", trip_count_mem); + network->set_input_data("initial_condition", initial_condition_mem); + + auto outputs = network->execute(); + ASSERT_EQ(outputs.size(), 1); + auto output = outputs.begin()->second.get_memory(); + auto output_layout = output->get_layout(); + + ASSERT_EQ(output_layout.batch(), 1); + ASSERT_EQ(output_layout.feature(), 1); + ASSERT_EQ(output_layout.spatial(0), 4); + ASSERT_EQ(output_layout.spatial(1), 5); + + // value check + { + mem_lock output_ptr{ output, get_test_stream() }; + ASSERT_EQ(output_ptr.size(), input_data.size()); + for (size_t i = 0, iend = input_data.size(); i < iend; ++i) { + ASSERT_FLOAT_EQ(output_ptr[i], input_data[i] + eltwise_operand[i] * trip_count); + } + } + + // allocate new output memory + layout loop_l = network->get_output_memory("loop")->get_layout(); + auto output_mem = engine.allocate_memory(loop_l); + network->set_output_memory("loop", output_mem); + + //one more execute + set_values(input_mem, input_data); + set_values(operand_mem, eltwise_operand); + set_values(trip_count_mem, { trip_count }); + set_values(initial_condition_mem, { initial_condition }); + outputs = network->execute(); + + // check everything once again + ASSERT_EQ(outputs.size(), 1); + auto output2 = outputs.begin()->second.get_memory(); + { + mem_lock output_ptr2{ output2, get_test_stream() }; + ASSERT_EQ(output_ptr2.size(), input_data.size()); + for (size_t i = 0, iend = input_data.size(); i < iend; ++i) { + ASSERT_FLOAT_EQ(output_ptr2[i], input_data[i] + eltwise_operand[i] * trip_count); + } + } +} + +TEST(loop_gpu, zero_bytes_layout) { + test_loop_gpu_zero_bytes_layout(false); +}