From dbd2e51fa32034da38cfc31aba628363508e26f4 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Fri, 13 Jan 2023 10:25:25 -0800 Subject: [PATCH 1/4] Fix level_zero_sub_sub_device testcase Updated run commands; replace partition_by_affinity_domain for sub-sub-devices by partition_by_cslice --- SYCL/Plugin/level_zero_sub_sub_device.cpp | 139 ++++++++-------------- 1 file changed, 49 insertions(+), 90 deletions(-) diff --git a/SYCL/Plugin/level_zero_sub_sub_device.cpp b/SYCL/Plugin/level_zero_sub_sub_device.cpp index f365483391..994d769932 100644 --- a/SYCL/Plugin/level_zero_sub_sub_device.cpp +++ b/SYCL/Plugin/level_zero_sub_sub_device.cpp @@ -1,8 +1,8 @@ // REQUIRES: gpu-intel-pvc, level_zero // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out -// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER -// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env ZE_DEBUG=1 env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER +// RUN: env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out // Check that queues created on sub-sub-devices are going to specific compute // engines: @@ -25,89 +25,66 @@ using namespace std::chrono; #define INTER_NUM (150) #define KERNEL_NUM (2000) -void run(std::vector &queues) { +void make_queue_and_run_workload(std::vector &subdevices) { + + std::cout << "[important] create " << subdevices.size() << + " sycl queues, one for each sub-sub device" << std::endl; + auto N = 1024 * 16; size_t global_range = 1024; size_t local_range = 16; - float *buffer_host0 = malloc_host(N, queues[0]); - float *buffer_device0 = malloc_device(N, queues[0]); + std::vector queues; + std::vector buffer_hosts; + std::vector buffer_devices; - float *buffer_host1 = malloc_host(N, queues[1]); - float *buffer_device1 = malloc_device(N, queues[1]); + // Create queues for each subdevice + for (auto subdevice : subdevices) { + queue temp(subdevice, {property::queue::enable_profiling(), + property::queue::in_order()}); + float* buffer_host = malloc_host(N, temp); + float* buffer_device = malloc_device(N, temp); - float *buffer_host2 = malloc_host(N, queues[2]); - float *buffer_device2 = malloc_device(N, queues[2]); + for (int i = 0; i < N; ++i) { + buffer_host[i] = static_cast(random_float()); + } - float *buffer_host3 = malloc_host(N, queues[3]); - float *buffer_device3 = malloc_device(N, queues[3]); + temp.memcpy(buffer_device, buffer_host, N * sizeof(float)).wait(); - for (int i = 0; i < N; ++i) { - buffer_host0[i] = static_cast(random_float()); - buffer_host1[i] = static_cast(random_float()); - buffer_host2[i] = static_cast(random_float()); - buffer_host3[i] = static_cast(random_float()); + buffer_hosts.push_back(buffer_host); + buffer_devices.push_back(buffer_device); + queues.push_back(temp); } - queues[0].memcpy(buffer_device0, buffer_host0, N * sizeof(float)).wait(); - queues[1].memcpy(buffer_device1, buffer_host1, N * sizeof(float)).wait(); - queues[2].memcpy(buffer_device2, buffer_host2, N * sizeof(float)).wait(); - queues[3].memcpy(buffer_device3, buffer_host3, N * sizeof(float)).wait(); + // Run workload for (auto m = 0; m < INTER_NUM; ++m) { for (int k = 0; k < KERNEL_NUM; ++k) { - auto event0 = queues[0].submit([&](handler &h) { - h.parallel_for( - nd_range<1>(range<1>{global_range}, range<1>{local_range}), - [=](nd_item<1> item) { - int i = item.get_global_linear_id(); - buffer_device0[i] = buffer_device0[i] + float(2.0); - }); - }); - auto event1 = queues[1].submit([&](handler &h) { - h.parallel_for( - nd_range<1>(range<1>{global_range}, range<1>{local_range}), - [=](nd_item<1> item) { - int i = item.get_global_linear_id(); - buffer_device1[i] = buffer_device1[i] + float(2.0); - }); - }); - auto event2 = queues[2].submit([&](handler &h) { - h.parallel_for( - nd_range<1>(range<1>{global_range}, range<1>{local_range}), - [=](nd_item<1> item) { - int i = item.get_global_linear_id(); - buffer_device2[i] = buffer_device2[i] + float(2.0); - }); - }); - auto event3 = queues[3].submit([&](handler &h) { - h.parallel_for( - nd_range<1>(range<1>{global_range}, range<1>{local_range}), - [=](nd_item<1> item) { - int i = item.get_global_linear_id(); - buffer_device3[i] = buffer_device3[i] + float(2.0); - }); - }); + for (int j = 0; j < queues.size(); j++){ + + queue current_queue = queues[j]; + float* buffer_device = buffer_devices[j]; + + auto event0 = current_queue.parallel_for<>( + nd_range<1>(range<1>{global_range}, range<1>{local_range}), + [=](nd_item<1> item) { + int i = item.get_global_linear_id(); + buffer_device[i] = buffer_device[i] + float(2.0); + }); + } } - queues[0].wait(); - queues[1].wait(); - queues[2].wait(); - queues[3].wait(); - } - free(buffer_host0, queues[0]); - free(buffer_device0, queues[0]); - - free(buffer_host1, queues[1]); - free(buffer_device1, queues[1]); - - free(buffer_host2, queues[2]); - free(buffer_device2, queues[2]); + for (auto q : queues) + q.wait(); + } - free(buffer_host3, queues[3]); - free(buffer_device3, queues[3]); + // De-allocate buffer hosts and devices + for (int j = 0; j < queues.size(); j++) { + sycl::free(buffer_devices[j], queues[j]); + sycl::free(buffer_hosts[j], queues[j]); + } - std::cout << "[info] Finish all" << std::endl; + std::cout << "[info] Finish running workload" << std::endl; } int main(void) { @@ -121,15 +98,15 @@ int main(void) { // watch out device here auto subdevices = - devices[1] + devices[0] .create_sub_devices< info::partition_property::partition_by_affinity_domain>( info::partition_affinity_domain::next_partitionable); std::cout << "[info] sub device size = " << subdevices.size() << std::endl; for (auto &subdev : subdevices) { auto subsubdevices = subdev.create_sub_devices< - info::partition_property::partition_by_affinity_domain>( - info::partition_affinity_domain::next_partitionable); + info::partition_property::ext_intel_partition_by_cslice>(); + std::cout << "[info] sub-sub device size = " << subsubdevices.size() << std::endl; for (auto &subsubdev : subsubdevices) { @@ -139,26 +116,8 @@ int main(void) { std::cout << "[info] all sub-sub devices count: " << subsub.size() << std::endl; - std::cout << "[important] create 4 sycl queues on first 4 sub-sub devices" - << std::endl; - - queue q0(subsub[0], - {property::queue::enable_profiling(), property::queue::in_order()}); - queue q1(subsub[1], - {property::queue::enable_profiling(), property::queue::in_order()}); - queue q2(subsub[2], - {property::queue::enable_profiling(), property::queue::in_order()}); - queue q3(subsub[4], - {property::queue::enable_profiling(), property::queue::in_order()}); - - std::vector queues; - - queues.push_back(std::move(q0)); - queues.push_back(std::move(q1)); - queues.push_back(std::move(q2)); - queues.push_back(std::move(q3)); - run(queues); + make_queue_and_run_workload(subsub); return 0; } From f526f302159aa188bcae940c7072af9631c8751a Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Fri, 13 Jan 2023 13:38:04 -0800 Subject: [PATCH 2/4] Fix formatting --- SYCL/Plugin/level_zero_sub_sub_device.cpp | 45 +++++++++++------------ 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/SYCL/Plugin/level_zero_sub_sub_device.cpp b/SYCL/Plugin/level_zero_sub_sub_device.cpp index 994d769932..dc9fd74075 100644 --- a/SYCL/Plugin/level_zero_sub_sub_device.cpp +++ b/SYCL/Plugin/level_zero_sub_sub_device.cpp @@ -27,50 +27,49 @@ using namespace std::chrono; void make_queue_and_run_workload(std::vector &subdevices) { - std::cout << "[important] create " << subdevices.size() << - " sycl queues, one for each sub-sub device" << std::endl; + std::cout << "[important] create " << subdevices.size() + << " sycl queues, one for each sub-sub device" << std::endl; auto N = 1024 * 16; size_t global_range = 1024; size_t local_range = 16; std::vector queues; - std::vector buffer_hosts; - std::vector buffer_devices; + std::vector buffer_hosts; + std::vector buffer_devices; // Create queues for each subdevice for (auto subdevice : subdevices) { - queue temp(subdevice, {property::queue::enable_profiling(), - property::queue::in_order()}); - float* buffer_host = malloc_host(N, temp); - float* buffer_device = malloc_device(N, temp); + queue temp(subdevice, {property::queue::enable_profiling(), + property::queue::in_order()}); + float *buffer_host = malloc_host(N, temp); + float *buffer_device = malloc_device(N, temp); - for (int i = 0; i < N; ++i) { - buffer_host[i] = static_cast(random_float()); - } + for (int i = 0; i < N; ++i) { + buffer_host[i] = static_cast(random_float()); + } - temp.memcpy(buffer_device, buffer_host, N * sizeof(float)).wait(); + temp.memcpy(buffer_device, buffer_host, N * sizeof(float)).wait(); - buffer_hosts.push_back(buffer_host); - buffer_devices.push_back(buffer_device); - queues.push_back(temp); + buffer_hosts.push_back(buffer_host); + buffer_devices.push_back(buffer_device); + queues.push_back(temp); } - // Run workload for (auto m = 0; m < INTER_NUM; ++m) { for (int k = 0; k < KERNEL_NUM; ++k) { - for (int j = 0; j < queues.size(); j++){ + for (int j = 0; j < queues.size(); j++) { queue current_queue = queues[j]; - float* buffer_device = buffer_devices[j]; + float *buffer_device = buffer_devices[j]; auto event0 = current_queue.parallel_for<>( - nd_range<1>(range<1>{global_range}, range<1>{local_range}), - [=](nd_item<1> item) { - int i = item.get_global_linear_id(); - buffer_device[i] = buffer_device[i] + float(2.0); - }); + nd_range<1>(range<1>{global_range}, range<1>{local_range}), + [=](nd_item<1> item) { + int i = item.get_global_linear_id(); + buffer_device[i] = buffer_device[i] + float(2.0); + }); } } From 1a4b3316406d58380e12866d9482eeab5424f896 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Fri, 13 Jan 2023 16:20:56 -0800 Subject: [PATCH 3/4] [SYCL] Used more appropiate variable names. Minor Formatting changes. --- SYCL/Plugin/level_zero_sub_sub_device.cpp | 51 ++++++++++------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/SYCL/Plugin/level_zero_sub_sub_device.cpp b/SYCL/Plugin/level_zero_sub_sub_device.cpp index dc9fd74075..805d153500 100644 --- a/SYCL/Plugin/level_zero_sub_sub_device.cpp +++ b/SYCL/Plugin/level_zero_sub_sub_device.cpp @@ -25,9 +25,8 @@ using namespace std::chrono; #define INTER_NUM (150) #define KERNEL_NUM (2000) -void make_queue_and_run_workload(std::vector &subdevices) { - - std::cout << "[important] create " << subdevices.size() +void make_queue_and_run_workload(std::vector &subsubdevices) { + std::cout << "[important] create " << subsubdevices.size() << " sycl queues, one for each sub-sub device" << std::endl; auto N = 1024 * 16; @@ -35,40 +34,39 @@ void make_queue_and_run_workload(std::vector &subdevices) { size_t local_range = 16; std::vector queues; - std::vector buffer_hosts; - std::vector buffer_devices; + std::vector host_mem_ptrs; + std::vector device_mem_ptrs; - // Create queues for each subdevice - for (auto subdevice : subdevices) { - queue temp(subdevice, {property::queue::enable_profiling(), + // Create queues for each subdevice. + for (auto &ccs : subsubdevices) { + queue q(ccs, {property::queue::enable_profiling(), property::queue::in_order()}); - float *buffer_host = malloc_host(N, temp); - float *buffer_device = malloc_device(N, temp); + auto *host_mem_ptr = malloc_host(N, q); + auto *device_mem_ptr = malloc_device(N, q); for (int i = 0; i < N; ++i) { - buffer_host[i] = static_cast(random_float()); + host_mem_ptr[i] = static_cast(random_float()); } - temp.memcpy(buffer_device, buffer_host, N * sizeof(float)).wait(); + q.memcpy(device_mem_ptr, host_mem_ptr, N * sizeof(float)).wait(); - buffer_hosts.push_back(buffer_host); - buffer_devices.push_back(buffer_device); - queues.push_back(temp); + host_mem_ptrs.push_back(host_mem_ptr); + device_mem_ptrs.push_back(device_mem_ptr); + queues.push_back(q); } - // Run workload + // Run workload. for (auto m = 0; m < INTER_NUM; ++m) { for (int k = 0; k < KERNEL_NUM; ++k) { for (int j = 0; j < queues.size(); j++) { - queue current_queue = queues[j]; - float *buffer_device = buffer_devices[j]; + float *device_mem_ptr = device_mem_ptrs[j]; auto event0 = current_queue.parallel_for<>( nd_range<1>(range<1>{global_range}, range<1>{local_range}), [=](nd_item<1> item) { int i = item.get_global_linear_id(); - buffer_device[i] = buffer_device[i] + float(2.0); + device_mem_ptr[i] = device_mem_ptr[i] + float(2.0); }); } } @@ -77,10 +75,9 @@ void make_queue_and_run_workload(std::vector &subdevices) { q.wait(); } - // De-allocate buffer hosts and devices for (int j = 0; j < queues.size(); j++) { - sycl::free(buffer_devices[j], queues[j]); - sycl::free(buffer_hosts[j], queues[j]); + sycl::free(device_mem_ptrs[j], queues[j]); + sycl::free(host_mem_ptrs[j], queues[j]); } std::cout << "[info] Finish running workload" << std::endl; @@ -92,15 +89,13 @@ int main(void) { << std::endl; std::vector subsub; - auto devices = device::get_devices(info::device_type::gpu); - std::cout << "[info] device count = " << devices.size() << std::endl; + device d; // watch out device here auto subdevices = - devices[0] - .create_sub_devices< - info::partition_property::partition_by_affinity_domain>( - info::partition_affinity_domain::next_partitionable); + d.create_sub_devices< + info::partition_property::partition_by_affinity_domain>( + info::partition_affinity_domain::next_partitionable); std::cout << "[info] sub device size = " << subdevices.size() << std::endl; for (auto &subdev : subdevices) { auto subsubdevices = subdev.create_sub_devices< From cd76a997c43faf7ffc5be6580c75a8593d4ae9e1 Mon Sep 17 00:00:00 2001 From: "Agarwal, Udit" Date: Fri, 13 Jan 2023 16:22:27 -0800 Subject: [PATCH 4/4] More formatting changes. --- SYCL/Plugin/level_zero_sub_sub_device.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/SYCL/Plugin/level_zero_sub_sub_device.cpp b/SYCL/Plugin/level_zero_sub_sub_device.cpp index 805d153500..920c932be1 100644 --- a/SYCL/Plugin/level_zero_sub_sub_device.cpp +++ b/SYCL/Plugin/level_zero_sub_sub_device.cpp @@ -39,8 +39,8 @@ void make_queue_and_run_workload(std::vector &subsubdevices) { // Create queues for each subdevice. for (auto &ccs : subsubdevices) { - queue q(ccs, {property::queue::enable_profiling(), - property::queue::in_order()}); + queue q(ccs, + {property::queue::enable_profiling(), property::queue::in_order()}); auto *host_mem_ptr = malloc_host(N, q); auto *device_mem_ptr = malloc_device(N, q); @@ -92,10 +92,9 @@ int main(void) { device d; // watch out device here - auto subdevices = - d.create_sub_devices< - info::partition_property::partition_by_affinity_domain>( - info::partition_affinity_domain::next_partitionable); + auto subdevices = d.create_sub_devices< + info::partition_property::partition_by_affinity_domain>( + info::partition_affinity_domain::next_partitionable); std::cout << "[info] sub device size = " << subdevices.size() << std::endl; for (auto &subdev : subdevices) { auto subsubdevices = subdev.create_sub_devices<