diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll index 5c8fcfdca2385..bf4f6c9def69d 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll @@ -10,30 +10,30 @@ ; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll new file mode 100644 index 0000000000000..06f0cb39c4b38 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll @@ -0,0 +1,133 @@ +; This test emulates two translation units with 3 kernels: +; TU0_kernel0 - 1st translation unit, no reqd_sub_group_size attribute used +; TU0_kernel1 - 1st translation unit, reqd_sub_group_size attribute is used +; TU1_kernel2 - 2nd translation unit, no reqd_sub_group_size attribute used + +; The test is intended to check that sycl-post-link correctly separates kernels +; that use reqd_sub_group_size attributes from kernels which doesn't use them +; regardless of device code split mode + +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-M1-IR \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.ll --check-prefixes CHECK-M2-IR \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2 +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 + +; Regardless of device code split mode, each kernel should go into a separate +; device image + +; CHECK-M2-IR: define {{.*}} @TU0_kernel0 +; CHECK-M2-SYMS: TU0_kernel0 + +; CHECK-M1-IR: define {{.*}} @TU0_kernel1 +; CHECK-M1-SYMS: TU0_kernel1 + +; CHECK-M0-IR: define {{.*}} @TU1_kernel2 +; CHECK-M0-SYMS: TU1_kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +; FIXME: device globals should also be properly distributed across device images +; if they are of optional type +@_ZL2GV = internal addrspace(1) constant [1 x i32] [i32 42], align 4 + +define dso_local spir_kernel void @TU0_kernel0() #0 { +entry: + call spir_func void @foo() + ret void +} + +define dso_local spir_func void @foo() { +entry: + %a = alloca i32, align 4 + %call = call spir_func i32 @bar(i32 1) + %add = add nsw i32 2, %call + store i32 %add, i32* %a, align 4 + ret void +} + +; Function Attrs: nounwind +define linkonce_odr dso_local spir_func i32 @bar(i32 %arg) { +entry: + %arg.addr = alloca i32, align 4 + store i32 %arg, i32* %arg.addr, align 4 + %0 = load i32, i32* %arg.addr, align 4 + ret i32 %0 +} + +define dso_local spir_kernel void @TU0_kernel1() #0 !intel_reqd_sub_group_size !2 { +entry: + call spir_func void @foo1() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo1() { +entry: + %a = alloca i32, align 4 + store i32 2, i32* %a, align 4 + ret void +} + +define dso_local spir_kernel void @TU1_kernel2() #1 { +entry: + call spir_func void @foo2() + ret void +} + +; Function Attrs: nounwind +define dso_local spir_func void @foo2() { +entry: + %a = alloca i32, align 4 + %0 = load i32, i32 addrspace(4)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(4)* addrspacecast ([1 x i32] addrspace(1)* @_ZL2GV to [1 x i32] addrspace(4)*), i64 0, i64 0), align 4 + %add = add nsw i32 4, %0 + store i32 %add, i32* %a, align 4 + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!opencl.spir.version = !{!0, !0} +!spirv.Source = !{!1, !1} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{i32 32} diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll new file mode 100644 index 0000000000000..0d0d4da32a526 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll @@ -0,0 +1,59 @@ +; The test is intended to check that sycl-post-link correctly groups kernels +; by unique reqd_sub_group_size values used in them + +; RUN: sycl-post-link -split=auto -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-M0-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ +; RUN: --implicit-check-not kernel2 +; +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M1-SYMS \ +; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ +; RUN: --implicit-check-not kernel3 + +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.sym +; CHECK-TABLE-NEXT: _1.sym +; CHECK-TABLE-NEXT: _2.sym +; CHECK-TABLE-EMPTY: + +; CHECK-M0-SYMS: kernel3 + +; CHECK-M1-SYMS: kernel0 + +; CHECK-M2-SYMS: kernel1 +; CHECK-M2-SYMS: kernel2 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define dso_local spir_kernel void @kernel0() #0 !intel_reqd_sub_group_size !1 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel1() #0 !intel_reqd_sub_group_size !2 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel2() #0 !intel_reqd_sub_group_size !3 { +entry: + ret void +} + +define dso_local spir_kernel void @kernel3() #0 !intel_reqd_sub_group_size !4 { +entry: + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +!1 = !{i32 32} +!2 = !{i32 64} +!3 = !{i32 64} +!4 = !{i32 16} diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-3.ll new file mode 100644 index 0000000000000..3193ac113788c --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-3.ll @@ -0,0 +1,54 @@ +; This test is intended to check that we do not perform per-reqd_sub_group_size +; split if it was disabled through one or another sycl-post-link option + +; RUN: sycl-post-link -symbols -S %s -o %t.table +; RUN: FileCheck %s -input-file=%t.table --check-prefix CHECK-TABLE +; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK-IR +; +; -lower-esimd is needed so sycl-post-link does not complain about no actions +; specified +; RUN: sycl-post-link -lower-esimd -ir-output-only -S %s -o %t.ll +; RUN: FileCheck %s -input-file=%t.ll --check-prefix CHECK-IR + +; We expect to see only one module generated: +; +; CHECK-TABLE: Code +; CHECK-TABLE-NEXT: _0.ll +; CHECK-TABLE-EMPTY: + +; Regardless of used reqd_sub_group_size and sycl-module-id metadata, all +; kernel and functions should still be present. + +; CHECK-IR-DAG: define spir_func void @foo +; CHECK-IR-DAG: define spir_func void @bar +; CHECK-IR-DAG: define spir_kernel void @kernel0 +; CHECK-IR-DAG: define spir_kernel void @kernel1 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-linux" + +define spir_func void @foo() #0 !intel_reqd_sub_group_size !1 { + ret void +} + +define spir_func void @bar() #1 !intel_reqd_sub_group_size !2 { + ret void +} + +define spir_kernel void @kernel0() #1 !intel_reqd_sub_group_size !2 { +entry: + ret void +} + +define spir_kernel void @kernel1() #0 !intel_reqd_sub_group_size !3 { +entry: + call void @foo() + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } +attributes #1 = { "sycl-module-id"="TU2.cpp" } + +!1 = !{i32 32} +!2 = !{i32 64} +!3 = !{i32 16, i32 16} diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll index 02bd44c53dcb4..e3261c297232e 100644 --- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll +++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll @@ -8,12 +8,12 @@ ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1 \ ; RUN: --implicit-check-not kernel2 ; -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M2-SYMS \ -; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 -; -; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M1-SYMS \ +; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-M1-SYMS \ ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2 \ ; RUN: --implicit-check-not kernel3 +; +; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \ +; RUN: --implicit-check-not kernel0 --implicit-check-not kernel3 ; CHECK-TABLE: Code ; CHECK-TABLE-NEXT: _0.sym diff --git a/llvm/test/tools/sycl-post-link/device-requirements/reqd-sub-group-size.ll b/llvm/test/tools/sycl-post-link/device-requirements/reqd-sub-group-size.ll new file mode 100644 index 0000000000000..86e06936da9f2 --- /dev/null +++ b/llvm/test/tools/sycl-post-link/device-requirements/reqd-sub-group-size.ll @@ -0,0 +1,194 @@ +; Original code: +; #include + +; int main() { +; sycl::queue q; +; q.submit([&](sycl::handler &h) { +; h.parallel_for( +; sycl::range<1>(32), +; [=](sycl::item<1> it) [[sycl::reqd_sub_group_size(16)]] {}); +; }); +; q.submit([&](sycl::handler &h) { +; h.parallel_for( +; sycl::range<1>(32), +; [=](sycl::item<1> it) [[sycl::reqd_sub_group_size(32)]] {}); +; }); +; q.submit([&](sycl::handler &h) { +; h.parallel_for( +; sycl::range<1>(32), +; [=](sycl::item<1> it) [[sycl::reqd_sub_group_size(16)]] {}); +; }); +; return 0; +; } + +; RUN: sycl-post-link -split=auto %s -o %t.table +; RUN: FileCheck %s -input-file=%t_0.prop --check-prefix CHECK-PROP-AUTO-SPLIT-0 +; RUN: FileCheck %s -input-file=%t_1.prop --check-prefix CHECK-PROP-AUTO-SPLIT-1 + +; CHECK-PROP-AUTO-SPLIT-0: [SYCL/device requirements] +; CHECK-PROP-AUTO-SPLIT-0: reqd_sub_group_size=2|gAAAAAAAAAAEAAAA + +; CHECK-PROP-AUTO-SPLIT-1: [SYCL/device requirements] +; CHECK-PROP-AUTO-SPLIT-1: reqd_sub_group_size=2|gAAAAAAAAAAIAAAA + +; ModuleID = '/tmp/source-706237.bc' +source_filename = "llvm-link" +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +$_ZTSN4sycl3_V16detail19__pf_kernel_wrapperIZZ4mainENKUlRNS0_7handlerEE_clES4_E7KernelAEE = comdat any + +$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E7KernelA = comdat any + +$_ZTSN4sycl3_V16detail19__pf_kernel_wrapperIZZ4mainENKUlRNS0_7handlerEE0_clES4_E7KernelBEE = comdat any + +$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E7KernelB = comdat any + +$_ZTSN4sycl3_V16detail19__pf_kernel_wrapperIZZ4mainENKUlRNS0_7handlerEE1_clES4_E7KernelCEE = comdat any + +$_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE1_clES2_E7KernelC = comdat any + +@__spirv_BuiltInGlobalInvocationId = external dso_local local_unnamed_addr addrspace(1) constant <3 x i64>, align 32 + +; Function Attrs: norecurse +define weak_odr dso_local spir_kernel void @_ZTSN4sycl3_V16detail19__pf_kernel_wrapperIZZ4mainENKUlRNS0_7handlerEE_clES4_E7KernelAEE() local_unnamed_addr #0 comdat !srcloc !46 !kernel_arg_buffer_location !47 !intel_reqd_sub_group_size !48 !sycl_fixed_targets !49 !sycl_kernel_omit_args !50 { +entry: + %0 = load <3 x i64>, <3 x i64> addrspace(1)* @__spirv_BuiltInGlobalInvocationId, align 32, !noalias !51 + %1 = extractelement <3 x i64> %0, i64 0 + %cmp.i.i = icmp ult i64 %1, 2147483648 + tail call void @llvm.assume(i1 %cmp.i.i) + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) +declare void @llvm.assume(i1 noundef) #1 + +; Function Attrs: norecurse +define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E7KernelA() local_unnamed_addr #0 comdat !srcloc !60 !kernel_arg_buffer_location !49 !intel_reqd_sub_group_size !48 !sycl_fixed_targets !49 !sycl_kernel_omit_args !49 { +entry: + ret void +} + +; Function Attrs: norecurse +define weak_odr dso_local spir_kernel void @_ZTSN4sycl3_V16detail19__pf_kernel_wrapperIZZ4mainENKUlRNS0_7handlerEE0_clES4_E7KernelBEE() local_unnamed_addr #0 comdat !srcloc !46 !kernel_arg_buffer_location !47 !intel_reqd_sub_group_size !61 !sycl_fixed_targets !49 !sycl_kernel_omit_args !50 { +entry: + %0 = load <3 x i64>, <3 x i64> addrspace(1)* @__spirv_BuiltInGlobalInvocationId, align 32, !noalias !62 + %1 = extractelement <3 x i64> %0, i64 0 + %cmp.i.i = icmp ult i64 %1, 2147483648 + tail call void @llvm.assume(i1 %cmp.i.i) + ret void +} + +; Function Attrs: norecurse +define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE0_clES2_E7KernelB() local_unnamed_addr #0 comdat !srcloc !71 !kernel_arg_buffer_location !49 !intel_reqd_sub_group_size !61 !sycl_fixed_targets !49 !sycl_kernel_omit_args !49 { +entry: + ret void +} + +; Function Attrs: norecurse +define weak_odr dso_local spir_kernel void @_ZTSN4sycl3_V16detail19__pf_kernel_wrapperIZZ4mainENKUlRNS0_7handlerEE1_clES4_E7KernelCEE() local_unnamed_addr #0 comdat !srcloc !46 !kernel_arg_buffer_location !47 !intel_reqd_sub_group_size !48 !sycl_fixed_targets !49 !sycl_kernel_omit_args !50 { +entry: + %0 = load <3 x i64>, <3 x i64> addrspace(1)* @__spirv_BuiltInGlobalInvocationId, align 32, !noalias !72 + %1 = extractelement <3 x i64> %0, i64 0 + %cmp.i.i = icmp ult i64 %1, 2147483648 + tail call void @llvm.assume(i1 %cmp.i.i) + ret void +} + +; Function Attrs: norecurse +define weak_odr dso_local spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE1_clES2_E7KernelC() local_unnamed_addr #0 comdat !srcloc !81 !kernel_arg_buffer_location !49 !intel_reqd_sub_group_size !48 !sycl_fixed_targets !49 !sycl_kernel_omit_args !49 { +entry: + ret void +} + +attributes #0 = { norecurse "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="source.cpp" "uniform-work-group-size"="true" } +attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) } + +!opencl.spir.version = !{!0} +!spirv.Source = !{!1} +!sycl_aspects = !{!2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42} +!llvm.ident = !{!43} +!llvm.module.flags = !{!44, !45} + +!0 = !{i32 1, i32 2} +!1 = !{i32 4, i32 100000} +!2 = !{!"host", i32 0} +!3 = !{!"cpu", i32 1} +!4 = !{!"gpu", i32 2} +!5 = !{!"accelerator", i32 3} +!6 = !{!"custom", i32 4} +!7 = !{!"fp16", i32 5} +!8 = !{!"fp64", i32 6} +!9 = !{!"image", i32 9} +!10 = !{!"online_compiler", i32 10} +!11 = !{!"online_linker", i32 11} +!12 = !{!"queue_profiling", i32 12} +!13 = !{!"usm_device_allocations", i32 13} +!14 = !{!"usm_host_allocations", i32 14} +!15 = !{!"usm_shared_allocations", i32 15} +!16 = !{!"usm_restricted_shared_allocations", i32 16} +!17 = !{!"usm_system_allocations", i32 17} +!18 = !{!"ext_intel_pci_address", i32 18} +!19 = !{!"ext_intel_gpu_eu_count", i32 19} +!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20} +!21 = !{!"ext_intel_gpu_slices", i32 21} +!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22} +!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23} +!24 = !{!"ext_intel_max_mem_bandwidth", i32 24} +!25 = !{!"ext_intel_mem_channel", i32 25} +!26 = !{!"usm_atomic_host_allocations", i32 26} +!27 = !{!"usm_atomic_shared_allocations", i32 27} +!28 = !{!"atomic64", i32 28} +!29 = !{!"ext_intel_device_info_uuid", i32 29} +!30 = !{!"ext_oneapi_srgb", i32 30} +!31 = !{!"ext_oneapi_native_assert", i32 31} +!32 = !{!"host_debuggable", i32 32} +!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33} +!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34} +!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35} +!36 = !{!"ext_intel_free_memory", i32 36} +!37 = !{!"ext_intel_device_id", i32 37} +!38 = !{!"ext_intel_memory_clock_rate", i32 38} +!39 = !{!"ext_intel_memory_bus_width", i32 39} +!40 = !{!"int64_base_atomics", i32 7} +!41 = !{!"int64_extended_atomics", i32 8} +!42 = !{!"usm_system_allocator", i32 17} +!43 = !{!"clang version 16.0.0"} +!44 = !{i32 1, !"wchar_size", i32 4} +!45 = !{i32 7, !"frame-pointer", i32 2} +!46 = !{i32 8347768} +!47 = !{i32 -1, i32 -1} +!48 = !{i32 16} +!49 = !{} +!50 = !{i1 true, i1 true} +!51 = !{!52, !54, !56, !58} +!52 = distinct !{!52, !53, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +!53 = distinct !{!53, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +!54 = distinct !{!54, !55, !"_ZN7__spirvL22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +!55 = distinct !{!55, !"_ZN7__spirvL22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +!56 = distinct !{!56, !57, !"_ZN4sycl3_V16detail7Builder7getItemILi1ELb1EEENSt9enable_ifIXT0_EKNS0_4itemIXT_EXT0_EEEE4typeEv: %agg.result"} +!57 = distinct !{!57, !"_ZN4sycl3_V16detail7Builder7getItemILi1ELb1EEENSt9enable_ifIXT0_EKNS0_4itemIXT_EXT0_EEEE4typeEv"} +!58 = distinct !{!58, !59, !"_ZN4sycl3_V16detail7Builder10getElementILi1ELb1EEEDTcl7getItemIXT_EXT0_EEEEPNS0_4itemIXT_EXT0_EEE: %agg.result"} +!59 = distinct !{!59, !"_ZN4sycl3_V16detail7Builder10getElementILi1ELb1EEEDTcl7getItemIXT_EXT0_EEEEPNS0_4itemIXT_EXT0_EEE"} +!60 = !{i32 170} +!61 = !{i32 32} +!62 = !{!63, !65, !67, !69} +!63 = distinct !{!63, !64, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +!64 = distinct !{!64, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +!65 = distinct !{!65, !66, !"_ZN7__spirvL22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +!66 = distinct !{!66, !"_ZN7__spirvL22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +!67 = distinct !{!67, !68, !"_ZN4sycl3_V16detail7Builder7getItemILi1ELb1EEENSt9enable_ifIXT0_EKNS0_4itemIXT_EXT0_EEEE4typeEv: %agg.result"} +!68 = distinct !{!68, !"_ZN4sycl3_V16detail7Builder7getItemILi1ELb1EEENSt9enable_ifIXT0_EKNS0_4itemIXT_EXT0_EEEE4typeEv"} +!69 = distinct !{!69, !70, !"_ZN4sycl3_V16detail7Builder10getElementILi1ELb1EEEDTcl7getItemIXT_EXT0_EEEEPNS0_4itemIXT_EXT0_EEE: %agg.result"} +!70 = distinct !{!70, !"_ZN4sycl3_V16detail7Builder10getElementILi1ELb1EEEDTcl7getItemIXT_EXT0_EEEEPNS0_4itemIXT_EXT0_EEE"} +!71 = !{i32 351} +!72 = !{!73, !75, !77, !79} +!73 = distinct !{!73, !74, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv: %agg.result"} +!74 = distinct !{!74, !"_ZN7__spirv29InitSizesSTGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEE8initSizeEv"} +!75 = distinct !{!75, !76, !"_ZN7__spirvL22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v: %agg.result"} +!76 = distinct !{!76, !"_ZN7__spirvL22initGlobalInvocationIdILi1EN4sycl3_V12idILi1EEEEET0_v"} +!77 = distinct !{!77, !78, !"_ZN4sycl3_V16detail7Builder7getItemILi1ELb1EEENSt9enable_ifIXT0_EKNS0_4itemIXT_EXT0_EEEE4typeEv: %agg.result"} +!78 = distinct !{!78, !"_ZN4sycl3_V16detail7Builder7getItemILi1ELb1EEENSt9enable_ifIXT0_EKNS0_4itemIXT_EXT0_EEEE4typeEv"} +!79 = distinct !{!79, !80, !"_ZN4sycl3_V16detail7Builder10getElementILi1ELb1EEEDTcl7getItemIXT_EXT0_EEEEPNS0_4itemIXT_EXT0_EEE: %agg.result"} +!80 = distinct !{!80, !"_ZN4sycl3_V16detail7Builder10getElementILi1ELb1EEEDTcl7getItemIXT_EXT0_EEEEPNS0_4itemIXT_EXT0_EEE"} +!81 = !{i32 532} diff --git a/llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll b/llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll index 4b92ceb623630..90216e57c8f66 100644 --- a/llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll +++ b/llvm/test/tools/sycl-post-link/device-requirements/reqd-work-group-size.ll @@ -26,12 +26,10 @@ ; RUN: FileCheck %s -input-file=%t_1.prop --check-prefix CHECK-PROP-AUTO-SPLIT-1 ; CHECK-PROP-AUTO-SPLIT-0: [SYCL/device requirements] -; CHECK-PROP-AUTO-SPLIT-0-NEXT: aspects=2|AAAAAAAAAAA -; CHECK-PROP-AUTO-SPLIT-0-NEXT: reqd_work_group_size=2|gAAAAAAAAAAQAAAA +; CHECK-PROP-AUTO-SPLIT-0: reqd_work_group_size=2|gAAAAAAAAAAQAAAA ; CHECK-PROP-AUTO-SPLIT-1: [SYCL/device requirements] -; CHECK-PROP-AUTO-SPLIT-1-NEXT: aspects=2|AAAAAAAAAAA -; CHECK-PROP-AUTO-SPLIT-1-NEXT: reqd_work_group_size=2|gAAAAAAAAAAIAAAA +; CHECK-PROP-AUTO-SPLIT-1: reqd_work_group_size=2|gAAAAAAAAAAIAAAA ; ModuleID = '/tmp/source-5f7d0d.bc' source_filename = "llvm-link" diff --git a/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll b/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll index 7506268f3b75a..b31aca9bc811d 100644 --- a/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll +++ b/llvm/test/tools/sycl-post-link/sycl-esimd-large-grf.ll @@ -9,16 +9,16 @@ ; RUN: sycl-post-link -split=source -symbols -split-esimd -lower-esimd -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t.table -; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.ll --check-prefixes CHECK-ESIMD-LargeGRF-IR -; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.prop --check-prefixes CHECK-ESIMD-LargeGRF-PROP -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYCL-SYM -; RUN: FileCheck %s -input-file=%t_esimd_1.sym --check-prefixes CHECK-ESIMD-SYM -; RUN: FileCheck %s -input-file=%t_esimd_large_grf_0.sym --check-prefixes CHECK-ESIMD-LargeGRF-SYM +; RUN: FileCheck %s -input-file=%t_esimd_large_grf_1.ll --check-prefixes CHECK-ESIMD-LargeGRF-IR +; RUN: FileCheck %s -input-file=%t_esimd_large_grf_1.prop --check-prefixes CHECK-ESIMD-LargeGRF-PROP +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-SYCL-SYM +; RUN: FileCheck %s -input-file=%t_esimd_2.sym --check-prefixes CHECK-ESIMD-SYM +; RUN: FileCheck %s -input-file=%t_esimd_large_grf_1.sym --check-prefixes CHECK-ESIMD-LargeGRF-SYM ; CHECK: [Code|Properties|Symbols] ; CHECK: {{.*}}_0.ll|{{.*}}_0.prop|{{.*}}_0.sym -; CHECK: {{.*}}esimd-large-grf.ll.tmp_1.ll|{{.*}}esimd-large-grf.ll.tmp_1.prop|{{.*}}esimd-large-grf.ll.tmp_1.sym -; CHECK: {{.*}}esimd-large-grf.ll.tmp_esimd_1.ll|{{.*}}esimd-large-grf.ll.tmp_esimd_1.prop|{{.*}}esimd-large-grf.ll.tmp_esimd_1.sym +; CHECK: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym +; CHECK: {{.*}}_2.ll|{{.*}}_2.prop|{{.*}}_2.sym ; CHECK-ESIMD-LargeGRF-PROP: isEsimdImage=1|1 ; CHECK-ESIMD-LargeGRF-PROP: isLargeGRF=1|1 diff --git a/llvm/test/tools/sycl-post-link/sycl-large-grf.ll b/llvm/test/tools/sycl-post-link/sycl-large-grf.ll index 8c140d0c3823a..c103232376abd 100644 --- a/llvm/test/tools/sycl-post-link/sycl-large-grf.ll +++ b/llvm/test/tools/sycl-post-link/sycl-large-grf.ll @@ -9,14 +9,14 @@ ; RUN: sycl-post-link -split=source -symbols -split-esimd -lower-esimd -S %s -o %t.table ; RUN: FileCheck %s -input-file=%t.table -; RUN: FileCheck %s -input-file=%t_large_grf_0.ll --check-prefixes CHECK-LARGE-GRF-IR -; RUN: FileCheck %s -input-file=%t_large_grf_0.prop --check-prefixes CHECK-LARGE-GRF-PROP -; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYCL-SYM -; RUN: FileCheck %s -input-file=%t_large_grf_0.sym --check-prefixes CHECK-LARGE-GRF-SYM +; RUN: FileCheck %s -input-file=%t_large_grf_1.ll --check-prefixes CHECK-LARGE-GRF-IR +; RUN: FileCheck %s -input-file=%t_large_grf_1.prop --check-prefixes CHECK-LARGE-GRF-PROP +; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-SYCL-SYM +; RUN: FileCheck %s -input-file=%t_large_grf_1.sym --check-prefixes CHECK-LARGE-GRF-SYM ; CHECK: [Code|Properties|Symbols] ; CHECK: {{.*}}_0.ll|{{.*}}_0.prop|{{.*}}_0.sym -; CHECK: {{.*}}-large-grf.ll.tmp_1.ll|{{.*}}-large-grf.ll.tmp_1.prop|{{.*}}-large-grf.ll.tmp_1.sym +; CHECK: {{.*}}_1.ll|{{.*}}_1.prop|{{.*}}_1.sym ; CHECK-LARGE-GRF-PROP: isLargeGRF=1|1 diff --git a/llvm/tools/sycl-post-link/ModuleSplitter.cpp b/llvm/tools/sycl-post-link/ModuleSplitter.cpp index a517d7639fffc..51d3519716bc8 100644 --- a/llvm/tools/sycl-post-link/ModuleSplitter.cpp +++ b/llvm/tools/sycl-post-link/ModuleSplitter.cpp @@ -714,7 +714,8 @@ struct UsedOptionalFeatures { SmallVector Aspects; bool UsesLargeGRF = false; SmallVector ReqdWorkGroupSize; - // TODO: extend this further with reqd-sub-group-size and other properties + int ReqdSubGroupSize = 0; + // TODO: extend this further with other properties UsedOptionalFeatures() = default; @@ -745,13 +746,24 @@ struct UsedOptionalFeatures { mdconst::extract(MDOp)->getZExtValue()); } + if (const MDNode *MDN = F->getMetadata("intel_reqd_sub_group_size")) { + size_t NumOperands = MDN->getNumOperands(); + assert(NumOperands == 1 && + "reqd_sub_group_size does not have 1 operand."); + ReqdSubGroupSize = + mdconst::extract(*(MDN->operands().begin())) + ->getZExtValue(); + } + llvm::hash_code AspectsHash = llvm::hash_combine_range(Aspects.begin(), Aspects.end()); llvm::hash_code LargeGRFHash = llvm::hash_value(UsesLargeGRF); llvm::hash_code ReqdWorkGroupSizeHash = llvm::hash_combine_range( ReqdWorkGroupSize.begin(), ReqdWorkGroupSize.end()); - Hash = static_cast( - llvm::hash_combine(AspectsHash, LargeGRFHash, ReqdWorkGroupSizeHash)); + llvm::hash_code ReqdSubGroupSizeHash = llvm::hash_value(ReqdSubGroupSize); + Hash = static_cast(llvm::hash_combine(AspectsHash, LargeGRFHash, + ReqdWorkGroupSizeHash, + ReqdSubGroupSizeHash)); } std::string generateModuleName(StringRef BaseName) const { @@ -762,6 +774,11 @@ struct UsedOptionalFeatures { Ret += "-" + std::to_string(V); } + if (ReqdSubGroupSize != 0) { + Ret += "-reqd-sub-group-size"; + Ret += "-" + std::to_string(ReqdSubGroupSize); + } + if (Aspects.empty()) return Ret + "-no-aspects"; diff --git a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp index 8d30db1a2522e..b0d715ea6b428 100644 --- a/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp +++ b/llvm/tools/sycl-post-link/SYCLDeviceRequirements.cpp @@ -32,7 +32,8 @@ void llvm::getSYCLDeviceRequirements( constexpr std::pair ReqdMDs[] = { {"sycl_used_aspects", "aspects"}, {"sycl_fixed_targets", "fixed_target"}, - {"reqd_work_group_size", "reqd_work_group_size"}}; + {"reqd_work_group_size", "reqd_work_group_size"}, + {"intel_reqd_sub_group_size", "reqd_sub_group_size"}}; for (const auto &MD : ReqdMDs) { std::set Values; diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 29154e33e3b27..1bb3b7b587d61 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -2196,9 +2196,7 @@ bool doesDevSupportDeviceRequirements(const device &Dev, auto AspectsPropIt = getPropIt("aspects"); auto ReqdWGSizePropIt = getPropIt("reqd_work_group_size"); - - if (!AspectsPropIt && !ReqdWGSizePropIt) - return true; + auto ReqdSubGroupSizePropIt = getPropIt("reqd_sub_group_size"); // Checking if device supports defined aspects if (AspectsPropIt) { @@ -2261,6 +2259,23 @@ bool doesDevSupportDeviceRequirements(const device &Dev, return false; } } + + // Check if device supports required sub-group size. + if (ReqdSubGroupSizePropIt) { + ByteArray ReqdSubGroupSize = + DeviceBinaryProperty(*(ReqdSubGroupSizePropIt.value())).asByteArray(); + // Drop 8 bytes describing the size of the byte array. + ReqdSubGroupSize.dropBytes(8); + int ReqdSubGroupSizeVal = 0; + if (!ReqdSubGroupSize.empty()) { + ReqdSubGroupSizeVal = ReqdSubGroupSize.consume(); + } + auto SupportedSubGroupSizes = Dev.get_info(); + if (std::find(SupportedSubGroupSizes.cbegin(), SupportedSubGroupSizes.cend(), + ReqdSubGroupSizeVal) == SupportedSubGroupSizes.cend()) { + return false; + } + } return true; } diff --git a/sycl/unittests/SYCL2020/IsCompatible.cpp b/sycl/unittests/SYCL2020/IsCompatible.cpp index 19a4d738af4f5..8dadb27616f18 100644 --- a/sycl/unittests/SYCL2020/IsCompatible.cpp +++ b/sycl/unittests/SYCL2020/IsCompatible.cpp @@ -10,6 +10,7 @@ class TestKernelCPUInvalidReqdWGSize1D; class TestKernelCPUInvalidReqdWGSize2D; class TestKernelCPUInvalidReqdWGSize3D; class TestKernelCPUValidReqdWGSize3D; +class TestKernelCPUInvalidReqSubGroupSize; class TestKernelGPU; class TestKernelACC; @@ -69,13 +70,6 @@ template <> struct KernelInfo { static constexpr int64_t getKernelSize() { return 1; } }; -} // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl - -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { template <> struct KernelInfo { static constexpr unsigned getNumParams() { return 0; } static const kernel_param_desc_t &getParamDesc(int) { @@ -91,13 +85,6 @@ template <> struct KernelInfo { static constexpr int64_t getKernelSize() { return 1; } }; -} // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl - -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { template <> struct KernelInfo { static constexpr unsigned getNumParams() { return 0; } static const kernel_param_desc_t &getParamDesc(int) { @@ -113,13 +100,21 @@ template <> struct KernelInfo { static constexpr int64_t getKernelSize() { return 1; } }; -} // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl +template <> struct KernelInfo { + static constexpr unsigned getNumParams() { return 0; } + static const kernel_param_desc_t &getParamDesc(int) { + static kernel_param_desc_t Dummy; + return Dummy; + } + static constexpr const char *getName() { + return "TestKernelCPUInvalidReqSubGroupSize"; + } + static constexpr bool isESIMD() { return false; } + static constexpr bool callsThisItem() { return false; } + static constexpr bool callsAnyThisFreeFunction() { return false; } + static constexpr int64_t getKernelSize() { return 1; } +}; -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { template <> struct KernelInfo { static constexpr unsigned getNumParams() { return 0; } static const kernel_param_desc_t &getParamDesc(int) { @@ -133,13 +128,6 @@ template <> struct KernelInfo { static constexpr int64_t getKernelSize() { return 1; } }; -} // namespace detail -} // __SYCL_INLINE_VER_NAMESPACE(_V1) -} // namespace sycl - -namespace sycl { -__SYCL_INLINE_VER_NAMESPACE(_V1) { -namespace detail { template <> struct KernelInfo { static constexpr unsigned getNumParams() { return 0; } static const kernel_param_desc_t &getParamDesc(int) { @@ -159,11 +147,13 @@ template <> struct KernelInfo { static sycl::unittest::PiImage generateDefaultImage(std::initializer_list KernelNames, - const std::vector &Aspects, const std::vector &ReqdWGSize = {}) { + const std::vector &Aspects, + const std::vector &ReqdWGSize = {}, + const int &ReqdSubGroupSize = 0) { using namespace sycl::unittest; PiPropertySet PropSet; - addDeviceRequirementsProps(PropSet, Aspects, ReqdWGSize); + addDeviceRequirementsProps(PropSet, Aspects, ReqdWGSize, ReqdSubGroupSize); std::vector Bin{0, 1, 2, 3, 4, 5}; // Random data @@ -180,10 +170,10 @@ generateDefaultImage(std::initializer_list KernelNames, return Img; } -static sycl::unittest::PiImage Imgs[7] = { +static sycl::unittest::PiImage Imgs[8] = { // Images for validating checks based on max_work_group_size + aspects - generateDefaultImage({"TestKernelCPU"}, {sycl::aspect::cpu}, - {32}), // 32 <= 256 (OK) + generateDefaultImage({"TestKernelCPU"}, {sycl::aspect::cpu}, {32}, + 16), // 32 <= 256 (OK), 16 is in {8, 16, 32}} generateDefaultImage({"TestKernelCPUInvalidReqdWGSize1D"}, {sycl::aspect::cpu}, {257}), // 257 > 256 (FAIL) generateDefaultImage({"TestKernelCPUInvalidReqdWGSize2D"}, @@ -195,11 +185,14 @@ static sycl::unittest::PiImage Imgs[7] = { generateDefaultImage( {"TestKernelCPUValidReqdWGSize3D"}, {sycl::aspect::cpu}, {2, 4, 5}), // 2 <= 254 (OK), 4 <= 255 (OK), 5 <= 256 (OK) + generateDefaultImage({"TestKernelCPUInvalidReqSubGroupSize"}, + {sycl::aspect::cpu}, {32}, + 256), // 256 is NOT in {8, 16, 32} (FAIL) // Images for validating checks for aspects generateDefaultImage({"TestKernelGPU"}, {sycl::aspect::gpu}), generateDefaultImage({"TestKernelACC"}, {sycl::aspect::accelerator})}; -static sycl::unittest::PiImageArray<7> ImgArray{Imgs}; +static sycl::unittest::PiImageArray<8> ImgArray{Imgs}; static pi_result redefinedDeviceGetInfoCPU(pi_device device, pi_device_info param_name, @@ -218,6 +211,17 @@ static pi_result redefinedDeviceGetInfoCPU(pi_device device, auto *Result = static_cast(param_value); *Result = 256; } + if (param_name == PI_DEVICE_INFO_SUB_GROUP_SIZES_INTEL) { + if (param_value_size_ret) { + *param_value_size_ret = 3 * sizeof(size_t); + } + if (param_value) { + auto *Result = static_cast(param_value); + Result[0] = 8; + Result[1] = 16; + Result[2] = 32; + } + } return PI_SUCCESS; } @@ -322,6 +326,16 @@ TEST(IsCompatible, CPUValidReqdWGSize3D) { EXPECT_TRUE(sycl::is_compatible(Dev)); } +TEST(IsCompatible, CPUInvalidReqSubGroupSize) { + sycl::unittest::PiMock Mock; + Mock.redefineAfter( + redefinedDeviceGetInfoCPU); + sycl::platform Plt = Mock.getPlatform(); + const sycl::device Dev = Plt.get_devices()[0]; + + EXPECT_FALSE(sycl::is_compatible(Dev)); +} + TEST(IsCompatible, GPU) { sycl::unittest::PiMock Mock; Mock.redefineAfter( diff --git a/sycl/unittests/helpers/PiImage.hpp b/sycl/unittests/helpers/PiImage.hpp index 38b06eef6d242..f092a9adc8584 100644 --- a/sycl/unittests/helpers/PiImage.hpp +++ b/sycl/unittests/helpers/PiImage.hpp @@ -505,13 +505,30 @@ inline PiProperty makeReqdWGSizeProp(const std::vector &ReqdWGSize) { return {"reqd_work_group_size", ValData, PI_PROPERTY_TYPE_BYTE_ARRAY}; } +inline PiProperty makeReqdSubGroupSizeProp(const int &ReqdSubGroupSize) { + const size_t BYTES_FOR_SIZE = 8; + std::vector ReqdSubGroupSizeVec = {ReqdSubGroupSize}; + std::vector ValData(BYTES_FOR_SIZE + sizeof(int)); + uint64_t ValDataSize = ValData.size(); + std::uninitialized_copy(&ValDataSize, &ValDataSize + sizeof(uint64_t), + ValData.data()); + auto *ReqdSubGroupSizePtr = + reinterpret_cast(&ReqdSubGroupSizeVec[0]); + std::uninitialized_copy(ReqdSubGroupSizePtr, + ReqdSubGroupSizePtr + sizeof(int), + ValData.data() + BYTES_FOR_SIZE); + return {"reqd_sub_group_size", ValData, PI_PROPERTY_TYPE_BYTE_ARRAY}; +} + inline void addDeviceRequirementsProps(PiPropertySet &Props, const std::vector &Aspects, - const std::vector &ReqdWGSize = {}) { + const std::vector &ReqdWGSize = {}, int ReqdSubGroupSize = 0) { PiArray Value{makeAspectsProp(Aspects)}; if (!ReqdWGSize.empty()) Value.push_back(makeReqdWGSizeProp(ReqdWGSize)); + if (ReqdSubGroupSize != 0) + Value.push_back(makeReqdSubGroupSizeProp(ReqdSubGroupSize)); Props.insert(__SYCL_PI_PROPERTY_SET_SYCL_DEVICE_REQUIREMENTS, std::move(Value)); }