From dbd2e51fa32034da38cfc31aba628363508e26f4 Mon Sep 17 00:00:00 2001
From: "Agarwal, Udit" <udit.agarwal@intel.com>
Date: Fri, 13 Jan 2023 10:25:25 -0800
Subject: [PATCH 1/4] Fix level_zero_sub_sub_device testcase

Updated run commands; replace partition_by_affinity_domain for sub-sub-devices by partition_by_cslice
---
 SYCL/Plugin/level_zero_sub_sub_device.cpp | 139 ++++++++--------------
 1 file changed, 49 insertions(+), 90 deletions(-)
diff --git a/SYCL/Plugin/level_zero_sub_sub_device.cpp b/SYCL/Plugin/level_zero_sub_sub_device.cpp
index f365483391..994d769932 100644
--- a/SYCL/Plugin/level_zero_sub_sub_device.cpp
+++ b/SYCL/Plugin/level_zero_sub_sub_device.cpp
@@ -1,8 +1,8 @@
 // REQUIRES:  gpu-intel-pvc, level_zero
 
 // RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.out
-// RUN: env ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: env ZE_DEBUG=1 env ZEX_NUMBER_OF_CCS=0:4  %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+// RUN: env ZEX_NUMBER_OF_CCS=0:4 %GPU_RUN_PLACEHOLDER %t.out
 
 // Check that queues created on sub-sub-devices are going to specific compute
 // engines:
@@ -25,89 +25,66 @@ using namespace std::chrono;
 #define INTER_NUM (150)
 #define KERNEL_NUM (2000)
 
-void run(std::vector<queue> &queues) {
+void make_queue_and_run_workload(std::vector<device> &subdevices) {
+
+  std::cout << "[important] create " << subdevices.size() <<
+               " sycl queues, one for each sub-sub device" << std::endl;
+
   auto N = 1024 * 16;
   size_t global_range = 1024;
   size_t local_range = 16;
 
-  float *buffer_host0 = malloc_host<float>(N, queues[0]);
-  float *buffer_device0 = malloc_device<float>(N, queues[0]);
+  std::vector<queue> queues;
+  std::vector<float*> buffer_hosts;
+  std::vector<float*> buffer_devices;
 
-  float *buffer_host1 = malloc_host<float>(N, queues[1]);
-  float *buffer_device1 = malloc_device<float>(N, queues[1]);
+  // Create queues for each subdevice
+  for (auto subdevice : subdevices) {
+      queue temp(subdevice, {property::queue::enable_profiling(),
+                             property::queue::in_order()});
+      float* buffer_host = malloc_host<float>(N, temp);
+      float* buffer_device = malloc_device<float>(N, temp);
 
-  float *buffer_host2 = malloc_host<float>(N, queues[2]);
-  float *buffer_device2 = malloc_device<float>(N, queues[2]);
+      for (int i = 0; i < N; ++i) {
+        buffer_host[i] = static_cast<float>(random_float());
+      }
 
-  float *buffer_host3 = malloc_host<float>(N, queues[3]);
-  float *buffer_device3 = malloc_device<float>(N, queues[3]);
+      temp.memcpy(buffer_device, buffer_host, N * sizeof(float)).wait();
 
-  for (int i = 0; i < N; ++i) {
-    buffer_host0[i] = static_cast<float>(random_float());
-    buffer_host1[i] = static_cast<float>(random_float());
-    buffer_host2[i] = static_cast<float>(random_float());
-    buffer_host3[i] = static_cast<float>(random_float());
+      buffer_hosts.push_back(buffer_host);
+      buffer_devices.push_back(buffer_device);
+      queues.push_back(temp);
   }
 
-  queues[0].memcpy(buffer_device0, buffer_host0, N * sizeof(float)).wait();
-  queues[1].memcpy(buffer_device1, buffer_host1, N * sizeof(float)).wait();
-  queues[2].memcpy(buffer_device2, buffer_host2, N * sizeof(float)).wait();
-  queues[3].memcpy(buffer_device3, buffer_host3, N * sizeof(float)).wait();
 
+  // Run workload
   for (auto m = 0; m < INTER_NUM; ++m) {
     for (int k = 0; k < KERNEL_NUM; ++k) {
-      auto event0 = queues[0].submit([&](handler &h) {
-        h.parallel_for<class kernel0>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device0[i] = buffer_device0[i] + float(2.0);
-            });
-      });
-      auto event1 = queues[1].submit([&](handler &h) {
-        h.parallel_for<class kernel1>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device1[i] = buffer_device1[i] + float(2.0);
-            });
-      });
-      auto event2 = queues[2].submit([&](handler &h) {
-        h.parallel_for<class kernel2>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device2[i] = buffer_device2[i] + float(2.0);
-            });
-      });
-      auto event3 = queues[3].submit([&](handler &h) {
-        h.parallel_for<class kernel3>(
-            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-            [=](nd_item<1> item) {
-              int i = item.get_global_linear_id();
-              buffer_device3[i] = buffer_device3[i] + float(2.0);
-            });
-      });
+      for (int j = 0; j < queues.size(); j++){
+
+        queue current_queue = queues[j];
+        float* buffer_device = buffer_devices[j];
+
+        auto event0 = current_queue.parallel_for<>(
+          nd_range<1>(range<1>{global_range}, range<1>{local_range}),
+          [=](nd_item<1> item) {
+            int i = item.get_global_linear_id();
+            buffer_device[i] = buffer_device[i] + float(2.0);
+       });
+      }
     }
-    queues[0].wait();
-    queues[1].wait();
-    queues[2].wait();
-    queues[3].wait();
-  }
 
-  free(buffer_host0, queues[0]);
-  free(buffer_device0, queues[0]);
-
-  free(buffer_host1, queues[1]);
-  free(buffer_device1, queues[1]);
-
-  free(buffer_host2, queues[2]);
-  free(buffer_device2, queues[2]);
+    for (auto q : queues)
+      q.wait();
+  }
 
-  free(buffer_host3, queues[3]);
-  free(buffer_device3, queues[3]);
+  // De-allocate buffer hosts and devices
+  for (int j = 0; j < queues.size(); j++) {
+    sycl::free(buffer_devices[j], queues[j]);
+    sycl::free(buffer_hosts[j], queues[j]);
+  }
 
-  std::cout << "[info] Finish all" << std::endl;
+  std::cout << "[info] Finish running workload" << std::endl;
 }
 
 int main(void) {
@@ -121,15 +98,15 @@ int main(void) {
 
   // watch out device here
   auto subdevices =
-      devices[1]
+      devices[0]
           .create_sub_devices<
               info::partition_property::partition_by_affinity_domain>(
               info::partition_affinity_domain::next_partitionable);
   std::cout << "[info] sub device size = " << subdevices.size() << std::endl;
   for (auto &subdev : subdevices) {
     auto subsubdevices = subdev.create_sub_devices<
-        info::partition_property::partition_by_affinity_domain>(
-        info::partition_affinity_domain::next_partitionable);
+        info::partition_property::ext_intel_partition_by_cslice>();
+
     std::cout << "[info] sub-sub device size = " << subsubdevices.size()
               << std::endl;
     for (auto &subsubdev : subsubdevices) {
@@ -139,26 +116,8 @@ int main(void) {
 
   std::cout << "[info] all sub-sub devices count: " << subsub.size()
             << std::endl;
-  std::cout << "[important] create 4 sycl queues on first 4 sub-sub devices"
-            << std::endl;
-
-  queue q0(subsub[0],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q1(subsub[1],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q2(subsub[2],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-  queue q3(subsub[4],
-           {property::queue::enable_profiling(), property::queue::in_order()});
-
-  std::vector<queue> queues;
-
-  queues.push_back(std::move(q0));
-  queues.push_back(std::move(q1));
-  queues.push_back(std::move(q2));
-  queues.push_back(std::move(q3));
 
-  run(queues);
+  make_queue_and_run_workload(subsub);
 
   return 0;
 }

From f526f302159aa188bcae940c7072af9631c8751a Mon Sep 17 00:00:00 2001
From: "Agarwal, Udit" <udit.agarwal@intel.com>
Date: Fri, 13 Jan 2023 13:38:04 -0800
Subject: [PATCH 2/4] Fix formatting

---
 SYCL/Plugin/level_zero_sub_sub_device.cpp | 45 +++++++++++------------
 1 file changed, 22 insertions(+), 23 deletions(-)

diff --git a/SYCL/Plugin/level_zero_sub_sub_device.cpp b/SYCL/Plugin/level_zero_sub_sub_device.cpp
index 994d769932..dc9fd74075 100644
--- a/SYCL/Plugin/level_zero_sub_sub_device.cpp
+++ b/SYCL/Plugin/level_zero_sub_sub_device.cpp
@@ -27,50 +27,49 @@ using namespace std::chrono;
 
 void make_queue_and_run_workload(std::vector<device> &subdevices) {
 
-  std::cout << "[important] create " << subdevices.size() <<
-               " sycl queues, one for each sub-sub device" << std::endl;
+  std::cout << "[important] create " << subdevices.size()
+            << " sycl queues, one for each sub-sub device" << std::endl;
 
   auto N = 1024 * 16;
   size_t global_range = 1024;
   size_t local_range = 16;
 
   std::vector<queue> queues;
-  std::vector<float*> buffer_hosts;
-  std::vector<float*> buffer_devices;
+  std::vector<float *> buffer_hosts;
+  std::vector<float *> buffer_devices;
 
   // Create queues for each subdevice
   for (auto subdevice : subdevices) {
-      queue temp(subdevice, {property::queue::enable_profiling(),
-                             property::queue::in_order()});
-      float* buffer_host = malloc_host<float>(N, temp);
-      float* buffer_device = malloc_device<float>(N, temp);
+    queue temp(subdevice, {property::queue::enable_profiling(),
+                           property::queue::in_order()});
+    float *buffer_host = malloc_host<float>(N, temp);
+    float *buffer_device = malloc_device<float>(N, temp);
 
-      for (int i = 0; i < N; ++i) {
-        buffer_host[i] = static_cast<float>(random_float());
-      }
+    for (int i = 0; i < N; ++i) {
+      buffer_host[i] = static_cast<float>(random_float());
+    }
 
-      temp.memcpy(buffer_device, buffer_host, N * sizeof(float)).wait();
+    temp.memcpy(buffer_device, buffer_host, N * sizeof(float)).wait();
 
-      buffer_hosts.push_back(buffer_host);
-      buffer_devices.push_back(buffer_device);
-      queues.push_back(temp);
+    buffer_hosts.push_back(buffer_host);
+    buffer_devices.push_back(buffer_device);
+    queues.push_back(temp);
   }
 
-
   // Run workload
   for (auto m = 0; m < INTER_NUM; ++m) {
     for (int k = 0; k < KERNEL_NUM; ++k) {
-      for (int j = 0; j < queues.size(); j++){
+      for (int j = 0; j < queues.size(); j++) {
 
         queue current_queue = queues[j];
-        float* buffer_device = buffer_devices[j];
+        float *buffer_device = buffer_devices[j];
 
         auto event0 = current_queue.parallel_for<>(
-          nd_range<1>(range<1>{global_range}, range<1>{local_range}),
-          [=](nd_item<1> item) {
-            int i = item.get_global_linear_id();
-            buffer_device[i] = buffer_device[i] + float(2.0);
-       });
+            nd_range<1>(range<1>{global_range}, range<1>{local_range}),
+            [=](nd_item<1> item) {
+              int i = item.get_global_linear_id();
+              buffer_device[i] = buffer_device[i] + float(2.0);
+            });
       }
     }
 

From 1a4b3316406d58380e12866d9482eeab5424f896 Mon Sep 17 00:00:00 2001
From: "Agarwal, Udit" <udit.agarwal@intel.com>
Date: Fri, 13 Jan 2023 16:20:56 -0800
Subject: [PATCH 3/4] [SYCL] Used more appropiate variable names. Minor
 Formatting changes.

---
 SYCL/Plugin/level_zero_sub_sub_device.cpp | 51 ++++++++++-------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/SYCL/Plugin/level_zero_sub_sub_device.cpp b/SYCL/Plugin/level_zero_sub_sub_device.cpp
index dc9fd74075..805d153500 100644
--- a/SYCL/Plugin/level_zero_sub_sub_device.cpp
+++ b/SYCL/Plugin/level_zero_sub_sub_device.cpp
@@ -25,9 +25,8 @@ using namespace std::chrono;
 #define INTER_NUM (150)
 #define KERNEL_NUM (2000)
 
-void make_queue_and_run_workload(std::vector<device> &subdevices) {
-
-  std::cout << "[important] create " << subdevices.size()
+void make_queue_and_run_workload(std::vector<device> &subsubdevices) {
+  std::cout << "[important] create " << subsubdevices.size()
             << " sycl queues, one for each sub-sub device" << std::endl;
 
   auto N = 1024 * 16;
@@ -35,40 +34,39 @@ void make_queue_and_run_workload(std::vector<device> &subdevices) {
   size_t local_range = 16;
 
   std::vector<queue> queues;
-  std::vector<float *> buffer_hosts;
-  std::vector<float *> buffer_devices;
+  std::vector<float *> host_mem_ptrs;
+  std::vector<float *> device_mem_ptrs;
 
-  // Create queues for each subdevice
-  for (auto subdevice : subdevices) {
-    queue temp(subdevice, {property::queue::enable_profiling(),
+  // Create queues for each subdevice.
+  for (auto &ccs : subsubdevices) {
+    queue q(ccs, {property::queue::enable_profiling(),
                            property::queue::in_order()});
-    float *buffer_host = malloc_host<float>(N, temp);
-    float *buffer_device = malloc_device<float>(N, temp);
+    auto *host_mem_ptr = malloc_host<float>(N, q);
+    auto *device_mem_ptr = malloc_device<float>(N, q);
 
     for (int i = 0; i < N; ++i) {
-      buffer_host[i] = static_cast<float>(random_float());
+      host_mem_ptr[i] = static_cast<float>(random_float());
     }
 
-    temp.memcpy(buffer_device, buffer_host, N * sizeof(float)).wait();
+    q.memcpy(device_mem_ptr, host_mem_ptr, N * sizeof(float)).wait();
 
-    buffer_hosts.push_back(buffer_host);
-    buffer_devices.push_back(buffer_device);
-    queues.push_back(temp);
+    host_mem_ptrs.push_back(host_mem_ptr);
+    device_mem_ptrs.push_back(device_mem_ptr);
+    queues.push_back(q);
   }
 
-  // Run workload
+  // Run workload.
   for (auto m = 0; m < INTER_NUM; ++m) {
     for (int k = 0; k < KERNEL_NUM; ++k) {
       for (int j = 0; j < queues.size(); j++) {
-
         queue current_queue = queues[j];
-        float *buffer_device = buffer_devices[j];
+        float *device_mem_ptr = device_mem_ptrs[j];
 
         auto event0 = current_queue.parallel_for<>(
             nd_range<1>(range<1>{global_range}, range<1>{local_range}),
             [=](nd_item<1> item) {
               int i = item.get_global_linear_id();
-              buffer_device[i] = buffer_device[i] + float(2.0);
+              device_mem_ptr[i] = device_mem_ptr[i] + float(2.0);
             });
       }
     }
@@ -77,10 +75,9 @@ void make_queue_and_run_workload(std::vector<device> &subdevices) {
       q.wait();
   }
 
-  // De-allocate buffer hosts and devices
   for (int j = 0; j < queues.size(); j++) {
-    sycl::free(buffer_devices[j], queues[j]);
-    sycl::free(buffer_hosts[j], queues[j]);
+    sycl::free(device_mem_ptrs[j], queues[j]);
+    sycl::free(host_mem_ptrs[j], queues[j]);
   }
 
   std::cout << "[info] Finish running workload" << std::endl;
@@ -92,15 +89,13 @@ int main(void) {
             << std::endl;
   std::vector<device> subsub;
 
-  auto devices = device::get_devices(info::device_type::gpu);
-  std::cout << "[info] device count = " << devices.size() << std::endl;
+  device d;
 
   // watch out device here
   auto subdevices =
-      devices[0]
-          .create_sub_devices<
-              info::partition_property::partition_by_affinity_domain>(
-              info::partition_affinity_domain::next_partitionable);
+       d.create_sub_devices<
+           info::partition_property::partition_by_affinity_domain>(
+           info::partition_affinity_domain::next_partitionable);
   std::cout << "[info] sub device size = " << subdevices.size() << std::endl;
   for (auto &subdev : subdevices) {
     auto subsubdevices = subdev.create_sub_devices<

From cd76a997c43faf7ffc5be6580c75a8593d4ae9e1 Mon Sep 17 00:00:00 2001
From: "Agarwal, Udit" <udit.agarwal@intel.com>
Date: Fri, 13 Jan 2023 16:22:27 -0800
Subject: [PATCH 4/4] More formatting changes.

---
 SYCL/Plugin/level_zero_sub_sub_device.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/SYCL/Plugin/level_zero_sub_sub_device.cpp b/SYCL/Plugin/level_zero_sub_sub_device.cpp
index 805d153500..920c932be1 100644
--- a/SYCL/Plugin/level_zero_sub_sub_device.cpp
+++ b/SYCL/Plugin/level_zero_sub_sub_device.cpp
@@ -39,8 +39,8 @@ void make_queue_and_run_workload(std::vector<device> &subsubdevices) {
 
   // Create queues for each subdevice.
   for (auto &ccs : subsubdevices) {
-    queue q(ccs, {property::queue::enable_profiling(),
-                           property::queue::in_order()});
+    queue q(ccs,
+            {property::queue::enable_profiling(), property::queue::in_order()});
     auto *host_mem_ptr = malloc_host<float>(N, q);
     auto *device_mem_ptr = malloc_device<float>(N, q);
 
@@ -92,10 +92,9 @@ int main(void) {
   device d;
 
   // watch out device here
-  auto subdevices =
-       d.create_sub_devices<
-           info::partition_property::partition_by_affinity_domain>(
-           info::partition_affinity_domain::next_partitionable);
+  auto subdevices = d.create_sub_devices<
+      info::partition_property::partition_by_affinity_domain>(
+      info::partition_affinity_domain::next_partitionable);
   std::cout << "[info] sub device size = " << subdevices.size() << std::endl;
   for (auto &subdev : subdevices) {
     auto subsubdevices = subdev.create_sub_devices<