From 1fd29f430946ef243dc1e87e908b2c0d7a26b2e0 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 24 Jan 2025 04:27:12 -0800 Subject: [PATCH 01/45] Partially revert a5e820976484 [SYCL] Remove `IsDeprecatedDeviceCopyable` (#16615) (#16744) Restore (logically) previous condition while keeping the rest of cleanups in place. Deprecation warning never worked, so I'm not even trying to implement that. --- .../sycl/detail/is_device_copyable.hpp | 21 ++++++++++++------- sycl/test/basic_tests/is_device_copyable.cpp | 10 +++++++++ .../basic_tests/is_device_copyable_neg.cpp | 6 +++--- 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/detail/is_device_copyable.hpp b/sycl/include/sycl/detail/is_device_copyable.hpp index bac24f4df3a11..ea036779546a6 100644 --- a/sycl/include/sycl/detail/is_device_copyable.hpp +++ b/sycl/include/sycl/detail/is_device_copyable.hpp @@ -93,20 +93,25 @@ namespace detail { template struct CheckFieldsAreDeviceCopyable; template struct CheckBasesAreDeviceCopyable; +template +inline constexpr bool is_deprecated_device_copyable_v = + is_device_copyable_v || (std::is_trivially_copy_constructible_v && + std::is_trivially_destructible_v); + template struct CheckFieldsAreDeviceCopyable> { - static_assert( - ((is_device_copyable_v && - ...)), - "The specified type is not device copyable"); + static_assert(((is_deprecated_device_copyable_v< + decltype(__builtin_field_type(T, FieldIds))> && + ...)), + "The specified type is not device copyable"); }; template struct CheckBasesAreDeviceCopyable> { - static_assert( - ((is_device_copyable_v && - ...)), - "The specified type is not device copyable"); + static_assert(((is_deprecated_device_copyable_v< + decltype(__builtin_base_type(T, BaseIds))> && + ...)), + "The specified type is not device copyable"); }; // All the captures of a lambda or functor of type FuncT passed to a kernel diff --git a/sycl/test/basic_tests/is_device_copyable.cpp 
b/sycl/test/basic_tests/is_device_copyable.cpp index 3e48bd5d77857..1c4199e954530 100644 --- a/sycl/test/basic_tests/is_device_copyable.cpp +++ b/sycl/test/basic_tests/is_device_copyable.cpp @@ -25,6 +25,14 @@ struct BCopyable { BCopyable(const BCopyable &x) : i(x.i) {} }; +// Not trivially copyable, but trivially copy constructible/destructible. +// Such types are passed to kernels to stay compatible with deprecated +// sycl 1.2.1 rules. +struct C : A { + const A C2; + C() : A{0}, C2{2} {} +}; + // Not copyable type, but it will be declared as device copyable. struct DCopyable { int i; @@ -59,6 +67,7 @@ void test() { A IamGood; IamGood.i = 0; BCopyable IamBadButCopyable(1); + C IamAlsoGood; DCopyable IamAlsoBadButCopyable{0}; marray MarrayForCopyableIsCopyable(0); range<2> Range{1,2}; @@ -69,6 +78,7 @@ void test() { int A = IamGood.i; int B = IamBadButCopyable.i; int C = IamAlsoBadButCopyable.i; + int D = IamAlsoGood.i; int E = MarrayForCopyableIsCopyable[0]; int F = Range[1]; int G = Id[2]; diff --git a/sycl/test/basic_tests/is_device_copyable_neg.cpp b/sycl/test/basic_tests/is_device_copyable_neg.cpp index c9007685ae693..61f88d6369aad 100644 --- a/sycl/test/basic_tests/is_device_copyable_neg.cpp +++ b/sycl/test/basic_tests/is_device_copyable_neg.cpp @@ -56,7 +56,7 @@ void test() { B IamAlsoBad{0}; marray MarrayForNotCopyable; queue Q; - // expected-error@*:* {{static assertion failed due to requirement 'is_device_copyable_v': The specified type is not device copyable}} + // expected-error@*:* {{static assertion failed due to requirement 'is_deprecated_device_copyable_v': The specified type is not device copyable}} Q.single_task([=] { int A = IamBad.i; int B = IamAlsoBad.i; @@ -64,10 +64,10 @@ void test() { }); FunctorA FA; - // expected-error@*:* {{static assertion failed due to requirement 'is_device_copyable_v': The specified type is not device copyable}} + // expected-error@*:* {{static assertion failed due to requirement 'is_deprecated_device_copyable_v': 
The specified type is not device copyable}} Q.single_task(FA); FunctorB FB; - // expected-error@*:* {{static assertion failed due to requirement 'is_device_copyable_v': The specified type is not device copyable}} + // expected-error@*:* {{static assertion failed due to requirement 'is_deprecated_device_copyable_v': The specified type is not device copyable}} Q.single_task(FB); } From ddc4347c8a1b861f5a59e9aedc73bda8e655b04f Mon Sep 17 00:00:00 2001 From: Kseniya Tikhomirova Date: Fri, 24 Jan 2025 14:58:10 +0100 Subject: [PATCH 02/45] [SYCL] Add missed checks of ur call result in commands.cpp (#16748) ur calls in enqueueImp should return error via return value (adapter->call_nocheck), not exception (adapter->call). ur_event_handle in corresponding event_impl also should be set only in case of successful call. Some functions have already had this logic. This PR fixes the rest. --------- Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/commands.cpp | 83 +++++++++++++-------- sycl/unittests/scheduler/FailedCommands.cpp | 74 ++++++++++++++++++ 2 files changed, 127 insertions(+), 30 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index ac00313e670de..005008a74ebd0 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2525,7 +2525,7 @@ static ur_result_t SetKernelParamsAndLaunch( property_list.size(), property_list.data(), RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], OutEventImpl ? 
&UREvent : nullptr); - if (OutEventImpl) { + if ((Error == UR_RESULT_SUCCESS) && OutEventImpl) { OutEventImpl->setHandle(UREvent); } return Error; @@ -3421,15 +3421,21 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { ur_bool_t NativeCommandSupport = false; assert(MQueue && "Native command should have an associated queue"); - MQueue->getAdapter()->call( + auto &Adapter = MQueue->getAdapter(); + Adapter->call( detail::getSyclObjImpl(MQueue->get_device())->getHandleRef(), UR_DEVICE_INFO_ENQUEUE_NATIVE_COMMAND_SUPPORT_EXP, sizeof(NativeCommandSupport), &NativeCommandSupport, nullptr); assert(NativeCommandSupport && "ext_codeplay_enqueue_native_command is not " "supported on this device"); - MQueue->getAdapter()->call( - MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, ReqMems.size(), - ReqMems.data(), nullptr, RawEvents.size(), RawEvents.data(), Event); + if (auto Result = + Adapter->call_nocheck( + MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, + ReqMems.size(), ReqMems.data(), nullptr, RawEvents.size(), + RawEvents.data(), Event); + Result != UR_RESULT_SUCCESS) + return Result; + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } @@ -3449,8 +3455,12 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { const AdapterPtr &Adapter = MQueue->getAdapter(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); - Adapter->call( - MQueue->getHandleRef(), &Properties, 0, nullptr, Event); + if (auto Result = + Adapter->call_nocheck( + MQueue->getHandleRef(), &Properties, 0, nullptr, Event); + Result != UR_RESULT_SUCCESS) + return Result; + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } @@ -3479,9 +3489,13 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { const AdapterPtr &Adapter = MQueue->getAdapter(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); - Adapter->call( - MQueue->getHandleRef(), &Properties, UrEvents.size(), &UrEvents[0], - Event); + if (auto Result = + Adapter->call_nocheck( + MQueue->getHandleRef(), &Properties, UrEvents.size(), + 
&UrEvents[0], Event); + Result != UR_RESULT_SUCCESS) + return Result; + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } @@ -3493,6 +3507,10 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { ur_event_handle_t *TimestampDeps = nullptr; size_t NumTimestampDeps = 0; + // TODO: once the following WA is removed, change call to call_nocheck and + // return the operation result to Command::enqueue (see other CG types). Set + // UREvent to EventImpl only for the successful case. + // If the queue is not in-order, the implementation will need to first // insert a marker event that the timestamp waits for. ur_event_handle_t PreTimestampMarkerEvent{}; @@ -3581,15 +3599,18 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { static_cast(MCommandGroup.get()); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); - ur_result_t Err = - MQueue->getAdapter() - ->call_nocheck( - CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(), - RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], - Event); + if (auto Result = + MQueue->getAdapter() + ->call_nocheck( + CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(), + RawEvents.size(), + RawEvents.empty() ? nullptr : &RawEvents[0], Event); + Result != UR_RESULT_SUCCESS) + return Result; + SetEventHandleOrDiscard(); - return Err; + return UR_RESULT_SUCCESS; } case CGType::CopyImage: { CGCopyImage *Copy = (CGCopyImage *)MCommandGroup.get(); @@ -3614,11 +3635,11 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { const detail::AdapterPtr &Adapter = MQueue->getAdapter(); auto OptWaitValue = SemWait->getWaitValue(); uint64_t WaitValue = OptWaitValue.has_value() ? 
OptWaitValue.value() : 0; - Adapter->call( - MQueue->getHandleRef(), SemWait->getExternalSemaphore(), - OptWaitValue.has_value(), WaitValue, 0, nullptr, nullptr); - return UR_RESULT_SUCCESS; + return Adapter + ->call_nocheck( + MQueue->getHandleRef(), SemWait->getExternalSemaphore(), + OptWaitValue.has_value(), WaitValue, 0, nullptr, nullptr); } case CGType::SemaphoreSignal: { assert(MQueue && @@ -3628,11 +3649,10 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { auto OptSignalValue = SemSignal->getSignalValue(); uint64_t SignalValue = OptSignalValue.has_value() ? OptSignalValue.value() : 0; - Adapter->call( - MQueue->getHandleRef(), SemSignal->getExternalSemaphore(), - OptSignalValue.has_value(), SignalValue, 0, nullptr, nullptr); - - return UR_RESULT_SUCCESS; + return Adapter + ->call_nocheck( + MQueue->getHandleRef(), SemSignal->getExternalSemaphore(), + OptSignalValue.has_value(), SignalValue, 0, nullptr, nullptr); } case CGType::None: { if (RawEvents.empty()) { @@ -3644,11 +3664,14 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { assert(MQueue && "Empty node should have an associated queue"); const detail::AdapterPtr &Adapter = MQueue->getAdapter(); ur_event_handle_t Event; - ur_result_t Result = Adapter->call_nocheck( - MQueue->getHandleRef(), RawEvents.size(), - RawEvents.size() ? &RawEvents[0] : nullptr, &Event); + if (auto Result = Adapter->call_nocheck( + MQueue->getHandleRef(), RawEvents.size(), + RawEvents.size() ? 
&RawEvents[0] : nullptr, &Event); + Result != UR_RESULT_SUCCESS) + return Result; + MEvent->setHandle(Event); - return Result; + return UR_RESULT_SUCCESS; } } return UR_RESULT_ERROR_INVALID_OPERATION; diff --git a/sycl/unittests/scheduler/FailedCommands.cpp b/sycl/unittests/scheduler/FailedCommands.cpp index 9273194d41344..48f0f906a0fc2 100644 --- a/sycl/unittests/scheduler/FailedCommands.cpp +++ b/sycl/unittests/scheduler/FailedCommands.cpp @@ -86,3 +86,77 @@ TEST_F(SchedulerTest, FailedCopyBackException) { &failingUrCall); RunWithFailedCommandsAndCheck(false, 1); } + +bool DummyEventReturned = false; +bool DummyEventReleaseAttempt = false; +ur_event_handle_t DummyEvent = mock::createDummyHandle(); + +inline ur_result_t failedEnqueueKernelLaunchWithDummy(void *pParams) { + DummyEventReturned = true; + auto params = *static_cast(pParams); + **params.pphEvent = DummyEvent; + return UR_RESULT_ERROR_UNKNOWN; +} + +inline ur_result_t checkDummyInEventRelease(void *pParams) { + auto params = static_cast(pParams); + DummyEventReleaseAttempt = params == DummyEvent; + return UR_RESULT_SUCCESS; +} + +inline ur_result_t failedEnqueueBarrierWithDummy(void *pParams) { + DummyEventReturned = true; + auto params = + *static_cast(pParams); + **params.pphEvent = DummyEvent; + return UR_RESULT_ERROR_UNKNOWN; +} + +// Checks that in case of failed command and "valid" event assigned to output +// event var, RT ignores it and does not call release since its usage is +// undefined behavior. 
+TEST(FailedCommandsTest, CheckUREventReleaseWithKernel) { + DummyEventReleaseAttempt = false; + DummyEventReturned = false; + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueKernelLaunch", + &failedEnqueueKernelLaunchWithDummy); + mock::getCallbacks().set_before_callback("urEventRelease", + &checkDummyInEventRelease); + platform Plt = sycl::platform(); + queue Queue(context(Plt), default_selector_v); + { + try { + Queue.submit( + [&](sycl::handler &CGH) { CGH.single_task>([]() {}); }); + } catch (...) { + } + } + Queue.wait(); + ASSERT_TRUE(DummyEventReturned); + ASSERT_FALSE(DummyEventReleaseAttempt); +} + +// Checks that in case of failed command and "valid" event assigned to output +// event var, RT ignores it and does not call release since its usage is +// undefined behavior. +TEST(FailedCommandsTest, CheckUREventReleaseWithBarrier) { + DummyEventReleaseAttempt = false; + DummyEventReturned = false; + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback("urEnqueueEventsWaitWithBarrierExt", + &failedEnqueueBarrierWithDummy); + mock::getCallbacks().set_before_callback("urEventRelease", + &checkDummyInEventRelease); + platform Plt = sycl::platform(); + queue Queue(context(Plt), default_selector_v); + { + try { + Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_barrier(); }); + } catch (...) 
{ + } + } + Queue.wait(); + ASSERT_TRUE(DummyEventReturned); + ASSERT_FALSE(DummyEventReleaseAttempt); +} From 02d2e34c1c83de00bab568d45675beb355c9d856 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mestre?= Date: Fri, 24 Jan 2025 13:58:26 +0000 Subject: [PATCH 03/45] [UR] Fix kernel arguments being overwritten in the CUDA and HIP adapters (#16733) https://github.com/oneapi-src/unified-runtime/pull/2559 --------- Co-authored-by: Kenneth Benzie (Benie) --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 3fee63268d37e..5ba8a6aa938f1 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit c6859445e01d433ec1cf3d87a244c5cf697fb290 -# Merge: d3e97040 b1b0c60c +# commit b841691699393dd2375e987c3d38d5f59c3e35cf +# Merge: c6859445 9de10cd9 # Author: Kenneth Benzie (Benie) -# Date: Thu Jan 23 13:59:15 2025 +0000 -# Merge pull request #2589 from Bensuo/fabio/fix_potential_race_condition -# Fix potential deadlock in the WaitEvent path of CmdBuffers -set(UNIFIED_RUNTIME_TAG c6859445e01d433ec1cf3d87a244c5cf697fb290) +# Date: Thu Jan 23 16:07:06 2025 +0000 +# Merge pull request #2559 from Bensuo/fix_kernel_arg_indices +# [CUDA][HIP] Fix kernel arguments being overwritten when added out of order +set(UNIFIED_RUNTIME_TAG b841691699393dd2375e987c3d38d5f59c3e35cf) From 0885da8345e5b825c1dbbee0d783ffd8902f0cb0 Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Fri, 24 Jan 2025 16:09:56 +0100 Subject: [PATCH 04/45] [CI][CTS] Update cts_exclude_filter_L0_GPU (#16762) The issue was resolved. 
--- devops/cts_exclude_filter_L0_GPU | 2 -- 1 file changed, 2 deletions(-) diff --git a/devops/cts_exclude_filter_L0_GPU b/devops/cts_exclude_filter_L0_GPU index fa45b9ac1e5cf..44d3870b88048 100644 --- a/devops/cts_exclude_filter_L0_GPU +++ b/devops/cts_exclude_filter_L0_GPU @@ -1,4 +1,2 @@ # Please use "#" to add comments here. # Do not delete the file even if it's empty. -# CMPLRTST-26179 -device From 5f69e3e48c87000782c12588a3b53ea909a1d42c Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 24 Jan 2025 09:03:30 -0800 Subject: [PATCH 05/45] [CI] Prebuild E2E tests as part of build workflow (#16682) This will allow us to save time on runner setup and toolchain download/unpacking. --- .github/workflows/sycl-linux-build.yml | 38 ++++++++++++++++++++-- .github/workflows/sycl-linux-precommit.yml | 19 ++--------- devops/actions/run-tests/e2e/action.yml | 4 ++- 3 files changed, 41 insertions(+), 20 deletions(-) diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 59f4aa99e4e42..d528aa6ce7ce3 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -14,7 +14,7 @@ on: build_image: type: string required: false - default: "ghcr.io/intel/llvm/ubuntu2404_build:latest" + default: "ghcr.io/intel/llvm/ubuntu2404_intel_drivers:alldeps" build_ref: type: string required: false @@ -70,7 +70,7 @@ on: build_image: type: choice options: - - "ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build" + - 'ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest' cc: type: choice options: @@ -252,3 +252,37 @@ jobs: name: sycl_linux_${{ inputs.build_artifact_suffix }} path: ${{ steps.artifact_info.outputs.ARCHIVE_NAME }} retention-days: ${{ inputs.retention-days }} + + - name: Copy toolchain + if: ${{ always() && !cancelled() && steps.build.conclusion == 'success' }} + # We must have the compiler in the same location as it will be in the E2E + # run-tests job. 
+ run: cp -r $GITHUB_WORKSPACE/build/install $GITHUB_WORKSPACE/toolchain + + - name: Source OneAPI TBB vars.sh + shell: bash + run: | + # https://github.com/actions/runner/issues/1964 prevents us from using + # the ENTRYPOINT in the image. + env | sort > env_before + if [ -e /runtimes/oneapi-tbb/env/vars.sh ]; then + source /runtimes/oneapi-tbb/env/vars.sh; + elif [ -e /opt/runtimes/oneapi-tbb/env/vars.sh ]; then + source /opt/runtimes/oneapi-tbb/env/vars.sh; + else + echo "no TBB vars in /opt/runtimes or /runtimes"; + fi + env | sort > env_after + comm -13 env_before env_after >> $GITHUB_ENV + rm env_before env_after + + - name: Build E2E tests + if: ${{ always() && !cancelled() && steps.build.conclusion == 'success' }} + uses: ./devops/actions/run-tests/e2e + with: + ref: ${{ inputs.ref || github.sha }} + merge_ref: ${{ inputs.merge_ref }} + e2e_testing_mode: build-only + target_devices: all + artifact_suffix: default + cxx_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++ diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index ced362ce178ea..813bc04f2a695 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -46,7 +46,7 @@ jobs: build_artifact_suffix: "default" build_cache_suffix: "default" # Docker image has last nightly pre-installed and added to the PATH - build_image: "ghcr.io/intel/llvm/sycl_ubuntu2204_nightly:build" + build_image: "ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest" cc: clang cxx: clang++ changes: ${{ needs.detect_changes.outputs.filters }} @@ -73,23 +73,8 @@ jobs: echo 'arc_tests="Matrix/"' >> "$GITHUB_OUTPUT" fi - build_e2e_tests: - needs: [build] - if: ${{ always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' }} - uses: ./.github/workflows/sycl-linux-run-tests.yml - with: - name: Build e2e tests - runner: '["Linux", "build"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:alldeps - image_options: -u 1001 
- ref: ${{ github.sha }} - merge_ref: '' - sycl_toolchain_artifact: sycl_linux_default - sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} - sycl_toolchain_decompress_command: ${{ needs.build.outputs.artifact_decompress_command }} - e2e_testing_mode: 'build-only' run_prebuilt_e2e_tests: - needs: [build, build_e2e_tests] + needs: [build] if: ${{ always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' }} strategy: fail-fast: false diff --git a/devops/actions/run-tests/e2e/action.yml b/devops/actions/run-tests/e2e/action.yml index bc0e6393757dc..414f88d08c058 100644 --- a/devops/actions/run-tests/e2e/action.yml +++ b/devops/actions/run-tests/e2e/action.yml @@ -19,6 +19,8 @@ inputs: required: false retention-days: required: false + cxx_compiler: + required: false runs: @@ -58,7 +60,7 @@ runs: if: inputs.e2e_binaries_artifact == '' shell: bash run: | - cmake -GNinja -B./build-e2e -S./llvm/sycl/test-e2e -DCMAKE_CXX_COMPILER="$(which clang++)" -DLLVM_LIT="$PWD/llvm/llvm/utils/lit/lit.py" ${{ steps.cmake_opts.outputs.opts }} + cmake -GNinja -B./build-e2e -S./llvm/sycl/test-e2e -DCMAKE_CXX_COMPILER="${{ inputs.cxx_compiler || '$(which clang++)'}}" -DLLVM_LIT="$PWD/llvm/llvm/utils/lit/lit.py" ${{ steps.cmake_opts.outputs.opts }} - name: SYCL End-to-end tests shell: bash {0} env: From 2307220616221996a64a17662377ebbe232f33fd Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Sat, 25 Jan 2025 03:25:40 +0900 Subject: [PATCH 06/45] [CI] Fix Windows oneAPI test failures (#16758) It was the math flags. We can't use `/Qno-intel-libs` and remove the env setup requirement from the e2e phase because of some insane linking errors I can't figure out and I don't think it's worth the time to investigate it any more. 
Closes: https://github.com/intel/llvm/issues/16362 Signed-off-by: Sarnie, Nick --- .github/workflows/sycl-post-commit.yml | 2 +- .github/workflows/sycl-windows-build.yml | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl-post-commit.yml index d0d5909f68228..ec90056d8ed00 100644 --- a/.github/workflows/sycl-post-commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -104,7 +104,7 @@ jobs: uses: ./.github/workflows/sycl-windows-build.yml with: compiler: icx - build_configure_extra_args: --cmake-opt=-DCMAKE_C_FLAGS="/clang:-Wno-nonportable-include-path /clang:-Wno-cast-function-type-mismatch" --cmake-opt=-DCMAKE_CXX_FLAGS="/clang:-Wno-nonportable-include-path /clang:-Wno-cast-function-type-mismatch" --cmake-opt="-DCMAKE_EXE_LINKER_FLAGS=/manifest:no" --cmake-opt="-DCMAKE_MODULE_LINKER_FLAGS=/manifest:no" --cmake-opt="-DCMAKE_SHARED_LINKER_FLAGS=/manifest:no" + build_configure_extra_args: --cmake-opt=-DCMAKE_C_FLAGS="/fp:precise /clang:-Wno-nonportable-include-path /clang:-Wno-cast-function-type-mismatch" --cmake-opt=-DCMAKE_CXX_FLAGS="/fp:precise /clang:-Wno-nonportable-include-path /clang:-Wno-cast-function-type-mismatch" --cmake-opt="-DCMAKE_EXE_LINKER_FLAGS=/manifest:no" --cmake-opt="-DCMAKE_MODULE_LINKER_FLAGS=/manifest:no" --cmake-opt="-DCMAKE_SHARED_LINKER_FLAGS=/manifest:no" build_cache_suffix: icx merge_ref: '' diff --git a/.github/workflows/sycl-windows-build.yml b/.github/workflows/sycl-windows-build.yml index c7349331f8f6f..9a19cf677e07e 100644 --- a/.github/workflows/sycl-windows-build.yml +++ b/.github/workflows/sycl-windows-build.yml @@ -135,10 +135,7 @@ jobs: - name: check-llvm if: always() && !cancelled() && contains(inputs.changes, 'llvm') shell: bash - run: | - if [[ ${{inputs.compiler}} == 'icx' ]]; then - export LIT_FILTER="SYCL" - fi + run: | cmake --build build --target check-llvm - name: check-clang if: always() && !cancelled() && 
contains(inputs.changes, 'clang') @@ -147,10 +144,7 @@ jobs: - name: check-sycl if: always() && !cancelled() && contains(inputs.changes, 'sycl') shell: bash - run: | - if [[ ${{inputs.compiler}} == 'icx' ]]; then - export LIT_XFAIL="regression\host_tanpi_double_accuracy.cpp" - fi + run: | cmake --build build --target check-sycl - name: check-sycl-unittests if: always() && !cancelled() && contains(inputs.changes, 'sycl') From 4d3d4e60d3f5868050f0aadbc4fca889df6986f2 Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Fri, 24 Jan 2025 20:05:44 +0100 Subject: [PATCH 07/45] [CI] New workflow for SYCL-CTS with SPIR-V backend (#16731) This patch adds new workflow to build SYCL-CTS with SPIR-V backend and run it on gen12 OCL CPU and PVC L0 GPU. It will be launched only on Sundays. --- .github/workflows/sycl-weekly.yml | 72 +++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 .github/workflows/sycl-weekly.yml diff --git a/.github/workflows/sycl-weekly.yml b/.github/workflows/sycl-weekly.yml new file mode 100644 index 0000000000000..d6dcf3114695e --- /dev/null +++ b/.github/workflows/sycl-weekly.yml @@ -0,0 +1,72 @@ +# This workflow builds SYCL-CTS with -fsycl-use-spirv-backend-for-spirv-gen and +# runs it with opencl:cpu & gen12 and level_zero:gpu & PVC on Sundays. 
+ +name: SYCL-CTS with SPIR-V backend + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * 0' + +permissions: read-all + +jobs: + ubuntu2204_build: + if: github.repository == 'intel/llvm' + uses: ./.github/workflows/sycl-linux-build.yml + secrets: inherit + with: + build_cache_root: "/__w/" + build_artifact_suffix: default + build_configure_extra_args: '' + merge_ref: '' + + build-sycl-cts: + needs: ubuntu2204_build + if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }} + uses: ./.github/workflows/sycl-linux-run-tests.yml + with: + name: Build SYCL-CTS + runner: '["Linux", "build"]' + cts_testing_mode: 'build-only' + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + tests_selector: cts + ref: ${{ github.sha }} + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} + extra_cmake_args: -DDPCPP_FLAGS=-fsycl-use-spirv-backend-for-spirv-gen + + run-sycl-cts: + needs: [ubuntu2204_build, build-sycl-cts] + if: ${{ always() && !cancelled() && needs.ubuntu2204_build.outputs.build_conclusion == 'success' }} + strategy: + fail-fast: false + matrix: + include: + - name: SYCL-CTS on OCL CPU PVC w/ LLVM SPIR-V Backend + runner: '["Linux", "pvc"]' + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: opencl:cpu + + - name: SYCL-CTS on L0 GPU PVC w/ LLVM SPIR-V Backend + runner: '["Linux", "pvc"]' + image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest + image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN + target_devices: level_zero:gpu + uses: ./.github/workflows/sycl-linux-run-tests.yml + with: + name: ${{ matrix.name }} + runner: ${{ 
matrix.runner }} + cts_testing_mode: 'run-only' + image: ${{ matrix.image }} + image_options: ${{ matrix.image_options }} + target_devices: ${{ matrix.target_devices }} + tests_selector: cts + ref: ${{ github.sha }} + sycl_toolchain_artifact: sycl_linux_default + sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} + sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} + sycl_cts_artifact: sycl_cts_bin From 5edcf744a3d76baf3277fc08d54e0a90b5e09a66 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 24 Jan 2025 11:36:25 -0800 Subject: [PATCH 08/45] [SYCL] Enable mapping of group load/store functions to SPIRV built-ins for local address space (#16653) Extension: https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_local_block_io.html Currently these built-ins for local address space are not supported by cpu/fpga backends, so introduce undocumented `native_local_block_io` property which allows to enable mapping to those built-ins. If this property is not provided then implementation falls back to naive approach. 
--- sycl/include/sycl/__spirv/spirv_ops.hpp | 41 + .../oneapi/experimental/group_load_store.hpp | 112 +- .../sycl/ext/oneapi/properties/property.hpp | 3 +- .../GroupAlgorithm/load_store/basic.cpp | 79 +- .../load_store/conversions_load.cpp | 24 +- .../load_store/conversions_store.cpp | 24 +- .../load_store/odd_sized_type.cpp | 60 +- .../GroupAlgorithm/load_store/odd_wg_size.cpp | 60 +- .../GroupAlgorithm/load_store/partial_sg.cpp | 66 +- sycl/test/check_device_code/group_load.cpp | 1644 ++++++++----- .../group_load_store_native_key.cpp | 179 ++ sycl/test/check_device_code/group_store.cpp | 2064 +++++++++++------ 12 files changed, 2996 insertions(+), 1360 deletions(-) create mode 100644 sycl/test/check_device_code/group_load_store_native_key.cpp diff --git a/sycl/include/sycl/__spirv/spirv_ops.hpp b/sycl/include/sycl/__spirv/spirv_ops.hpp index 16d90aad9617b..5800190f539a0 100644 --- a/sycl/include/sycl/__spirv/spirv_ops.hpp +++ b/sycl/include/sycl/__spirv/spirv_ops.hpp @@ -445,6 +445,47 @@ template __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void __spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_global)) uint64_t *Ptr, dataT Data) noexcept; + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT +__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local)) + uint8_t *Ptr) noexcept; + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void +__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint8_t *Ptr, + dataT Data) noexcept; + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT +__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local)) + uint16_t *Ptr) noexcept; + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void +__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint16_t *Ptr, + dataT Data) noexcept; + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT +__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local)) + uint32_t *Ptr) noexcept; + 
+template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void +__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint32_t *Ptr, + dataT Data) noexcept; + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL dataT +__spirv_SubgroupBlockReadINTEL(const __attribute__((opencl_local)) + uint64_t *Ptr) noexcept; + +template +__SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL void +__spirv_SubgroupBlockWriteINTEL(__attribute__((opencl_local)) uint64_t *Ptr, + dataT Data) noexcept; + template extern __DPCPP_SYCL_EXTERNAL sycl::detail::ap_int __spirv_FixedSqrtINTEL(sycl::detail::ap_int a, bool S, int32_t I, int32_t rI, diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp index 694bf2a5eb302..fc196ee134f6b 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/group_load_store.hpp @@ -58,6 +58,13 @@ struct naive_key : detail::compile_time_property_key { using value_t = property_value; }; inline constexpr naive_key::value_t naive; + +struct native_local_block_io_key + : detail::compile_time_property_key { + using value_t = property_value; +}; +inline constexpr native_local_block_io_key::value_t native_local_block_io; + using namespace sycl::detail; } // namespace detail @@ -154,7 +161,6 @@ template struct BlockTypeInfo; template struct BlockTypeInfo> { using BlockInfoTy = BlockInfo; - static_assert(BlockInfoTy::has_builtin); using block_type = detail::fixed_width_unsigned; @@ -163,15 +169,23 @@ struct BlockTypeInfo> { typename std::iterator_traits::reference>>, std::add_const_t, block_type>; - using block_pointer_type = typename detail::DecoratedType< - block_pointer_elem_type, access::address_space::global_space>::type *; + static constexpr auto deduced_address_space = + detail::deduce_AS>::value; + + using block_pointer_type = + typename detail::DecoratedType::type *; + using block_op_type = 
std::conditional_t< BlockInfoTy::num_blocks == 1, block_type, detail::ConvertToOpenCLType_t>>; }; -// Returns either a pointer suitable to use in a block read/write builtin or -// nullptr if some legality conditions aren't satisfied. +// Returns either a pointer decorated with the deduced address space, suitable +// to use in a block read/write builtin, or nullptr if some legality conditions +// aren't satisfied. If deduced address space is generic then returned pointer +// will have generic address space and has to be dynamically cast to global or +// local space before using in a builtin. template auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) { @@ -211,16 +225,17 @@ auto get_block_op_ptr(IteratorT iter, [[maybe_unused]] Properties props) { bool is_aligned = alignof(value_type) >= RequiredAlign || reinterpret_cast(iter) % RequiredAlign == 0; - constexpr auto AS = detail::deduce_AS::value; using block_pointer_type = typename BlockTypeInfo::block_pointer_type; - if constexpr (AS == access::address_space::global_space) { + + static constexpr auto deduced_address_space = + BlockTypeInfo::deduced_address_space; + if constexpr (deduced_address_space == + access::address_space::generic_space || + deduced_address_space == + access::address_space::global_space || + deduced_address_space == access::address_space::local_space) { return is_aligned ? reinterpret_cast(iter) : nullptr; - } else if constexpr (AS == access::address_space::generic_space) { - return is_aligned ? reinterpret_cast( - detail::dynamic_address_cast< - access::address_space::global_space>(iter)) - : nullptr; } else { return nullptr; } @@ -261,11 +276,37 @@ group_load(Group g, InputIteratorT in_ptr, // Do optimized load. 
using value_type = remove_decoration_t< typename std::iterator_traits::value_type>; - - auto load = __spirv_SubgroupBlockReadINTEL< - typename detail::BlockTypeInfo>::block_op_type>( - ptr); + using block_info = typename detail::BlockTypeInfo< + detail::BlockInfo>; + static constexpr auto deduced_address_space = + block_info::deduced_address_space; + using block_op_type = typename block_info::block_op_type; + + if constexpr (deduced_address_space == + access::address_space::local_space && + !props.template has_property< + detail::native_local_block_io_key>()) + return group_load(g, in_ptr, out, use_naive{}); + + block_op_type load; + if constexpr (deduced_address_space == + access::address_space::generic_space) { + if (auto local_ptr = detail::dynamic_address_cast< + access::address_space::local_space>(ptr)) { + if constexpr (props.template has_property< + detail::native_local_block_io_key>()) + load = __spirv_SubgroupBlockReadINTEL(local_ptr); + else + return group_load(g, in_ptr, out, use_naive{}); + } else if (auto global_ptr = detail::dynamic_address_cast< + access::address_space::global_space>(ptr)) { + load = __spirv_SubgroupBlockReadINTEL(global_ptr); + } else { + return group_load(g, in_ptr, out, use_naive{}); + } + } else { + load = __spirv_SubgroupBlockReadINTEL(ptr); + } // TODO: accessor_iterator's value_type is weird, so we need // `std::remove_const_t` below: @@ -331,6 +372,16 @@ group_store(Group g, const span in, return group_store(g, in, out_ptr, use_naive{}); if constexpr (!std::is_same_v) { + using block_info = typename detail::BlockTypeInfo< + detail::BlockInfo>; + static constexpr auto deduced_address_space = + block_info::deduced_address_space; + if constexpr (deduced_address_space == + access::address_space::local_space && + !props.template has_property< + detail::native_local_block_io_key>()) + return group_store(g, in, out_ptr, use_naive{}); + // Do optimized store. 
std::remove_const_t::value_type>> @@ -341,11 +392,28 @@ group_store(Group g, const span in, values[i] = in[i]; } - __spirv_SubgroupBlockWriteINTEL( - ptr, - sycl::bit_cast>::block_op_type>( - values)); + using block_op_type = typename block_info::block_op_type; + if constexpr (deduced_address_space == + access::address_space::generic_space) { + if (auto local_ptr = detail::dynamic_address_cast< + access::address_space::local_space>(ptr)) { + if constexpr (props.template has_property< + detail::native_local_block_io_key>()) + __spirv_SubgroupBlockWriteINTEL( + local_ptr, sycl::bit_cast(values)); + else + return group_store(g, in, out_ptr, use_naive{}); + } else if (auto global_ptr = detail::dynamic_address_cast< + access::address_space::global_space>(ptr)) { + __spirv_SubgroupBlockWriteINTEL( + global_ptr, sycl::bit_cast(values)); + } else { + return group_store(g, in, out_ptr, use_naive{}); + } + } else { + __spirv_SubgroupBlockWriteINTEL(ptr, + sycl::bit_cast(values)); + } } } } diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index 715a3e8b83252..8de51110e7089 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -224,8 +224,9 @@ enum PropKind : uint32_t { WorkGroupScratchSize = 79, Restrict = 80, EventMode = 81, + NativeLocalBlockIO = 82, // PropKindSize must always be the last value. 
- PropKindSize = 82, + PropKindSize = 83, }; template struct PropertyToKind { diff --git a/sycl/test-e2e/GroupAlgorithm/load_store/basic.cpp b/sycl/test-e2e/GroupAlgorithm/load_store/basic.cpp index 1028da48f6051..e64a466548597 100644 --- a/sycl/test-e2e/GroupAlgorithm/load_store/basic.cpp +++ b/sycl/test-e2e/GroupAlgorithm/load_store/basic.cpp @@ -6,8 +6,9 @@ #include -int main() { - using namespace sycl; +using namespace sycl; + +template int test(queue &q) { namespace sycl_exp = sycl::ext::oneapi::experimental; constexpr std::size_t wg_size = 32; @@ -16,8 +17,6 @@ int main() { constexpr std::size_t elems_per_wi = 4; constexpr std::size_t n = global_size * elems_per_wi; - queue q; - buffer input_buf{n}; { @@ -42,8 +41,10 @@ int main() { accessor store_blocked{store_blocked_buf, cgh}; accessor store_striped{store_striped_buf, cgh}; + local_accessor local_acc{wg_size * elems_per_wi, cgh}; cgh.parallel_for(nd_range<1>{global_size, wg_size}, [=](nd_item<1> ndi) { auto gid = ndi.get_global_id(0); + auto lid = ndi.get_local_id(0); auto g = ndi.get_group(); auto offset = g.get_group_id(0) * g.get_local_range(0) * elems_per_wi; @@ -52,18 +53,39 @@ int main() { auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked}; auto striped = sycl_exp::properties{sycl_exp::data_placement_striped}; + if constexpr (addr_space == access::address_space::local_space) { + // Copy input to local memory. 
+ for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi; + i++) { + local_acc[i] = input[offset + i]; + } + ndi.barrier(access::fence_space::local_space); + } + // default - sycl_exp::group_load(g, input.begin() + offset, span{data}); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(g, local_acc.begin(), span{data}); + } else { + sycl_exp::group_load(g, input.begin() + offset, span{data}); + } for (int i = 0; i < elems_per_wi; ++i) load_blocked_default[gid * elems_per_wi + i] = data[i]; // blocked - sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(g, local_acc.begin(), span{data}, blocked); + } else { + sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked); + } for (int i = 0; i < elems_per_wi; ++i) load_blocked[gid * elems_per_wi + i] = data[i]; // striped - sycl_exp::group_load(g, input.begin() + offset, span{data}, striped); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(g, local_acc.begin(), span{data}, striped); + } else { + sycl_exp::group_load(g, input.begin() + offset, span{data}, striped); + } for (int i = 0; i < elems_per_wi; ++i) load_striped[gid * elems_per_wi + i] = data[i]; @@ -71,12 +93,36 @@ int main() { std::iota(std::begin(data), std::end(data), gid * elems_per_wi); - sycl_exp::group_store(g, span{data}, - store_blocked_default.begin() + offset); - sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset, - blocked); - sycl_exp::group_store(g, span{data}, store_striped.begin() + offset, - striped); + auto copy_local_acc_to_global_output = [&](accessor output) { + for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi; + i++) { + output[offset + i] = local_acc[i]; + } + }; + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(g, span{data}, 
local_acc.begin()); + copy_local_acc_to_global_output(store_blocked_default); + } else { + sycl_exp::group_store(g, span{data}, + store_blocked_default.begin() + offset); + } + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(g, span{data}, local_acc.begin(), blocked); + copy_local_acc_to_global_output(store_blocked); + } else { + sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset, + blocked); + } + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(g, span{data}, local_acc.begin(), striped); + copy_local_acc_to_global_output(store_striped); + } else { + sycl_exp::group_store(g, span{data}, store_striped.begin() + offset, + striped); + } }); }); @@ -111,3 +157,10 @@ int main() { return 0; } + +int main() { + queue q; + test(q); + test(q); + return 0; +} diff --git a/sycl/test-e2e/GroupAlgorithm/load_store/conversions_load.cpp b/sycl/test-e2e/GroupAlgorithm/load_store/conversions_load.cpp index fe06a5c2a93e7..c569ad01d6ce2 100644 --- a/sycl/test-e2e/GroupAlgorithm/load_store/conversions_load.cpp +++ b/sycl/test-e2e/GroupAlgorithm/load_store/conversions_load.cpp @@ -13,14 +13,12 @@ struct S { int i; }; -int main() { +template int test(sycl::queue &q) { using namespace sycl; namespace sycl_exp = sycl::ext::oneapi::experimental; constexpr std::size_t wg_size = 16; - queue q; - buffer input_buf{wg_size * 2}; { host_accessor acc{input_buf}; @@ -31,12 +29,23 @@ int main() { q.submit([&](handler &cgh) { accessor input{input_buf, cgh}; accessor success{success_buf, cgh}; + local_accessor local_acc{wg_size * 2, cgh}; cgh.parallel_for(nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) { auto gid = ndi.get_global_id(0); + auto lid = ndi.get_local_id(0); auto g = ndi.get_group(); S data[2]; - sycl_exp::group_load(g, input.begin(), span{data}); + + if constexpr (addr_space == access::address_space::local_space) { + for (int i = lid * 2; i < lid * 2 + 2; i++) { + local_acc[i] = 
input[i]; + } + ndi.barrier(access::fence_space::local_space); + sycl_exp::group_load(g, local_acc.begin(), span{data}); + } else { + sycl_exp::group_load(g, input.begin(), span{data}); + } bool ok = true; ok &= (data[0].i == gid * 2 + 0 + 42); @@ -50,3 +59,10 @@ int main() { return 0; } + +int main() { + sycl::queue q; + test(q); + test(q); + return 0; +} diff --git a/sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp b/sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp index de55f6bc1f289..7d6460b7ba5ff 100644 --- a/sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp +++ b/sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp @@ -11,26 +11,35 @@ struct S { }; static_assert(std::is_trivially_copyable_v); -int main() { +template int test(sycl::queue &q) { using namespace sycl; namespace sycl_exp = sycl::ext::oneapi::experimental; constexpr std::size_t wg_size = 16; - queue q; - buffer output_buf{wg_size * 2}; q.submit([&](handler &cgh) { accessor output{output_buf, cgh}; + local_accessor local_acc{wg_size * 2, cgh}; cgh.parallel_for(nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) { auto gid = ndi.get_global_id(0); + auto lid = ndi.get_local_id(0); auto g = ndi.get_group(); S data[2]; data[0].i = gid * 2 + 0; data[1].i = gid * 2 + 1; - sycl_exp::group_store(g, span{data}, output.begin()); + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(g, span{data}, local_acc.begin()); + ndi.barrier(access::fence_space::local_space); + for (int i = lid * 2; i < lid * 2 + 2; i++) { + output[i] = local_acc[i]; + } + } else { + sycl_exp::group_store(g, span{data}, output.begin()); + } }); }); @@ -41,3 +50,10 @@ int main() { return 0; } + +int main() { + sycl::queue q; + test(q); + test(q); + return 0; +} diff --git a/sycl/test-e2e/GroupAlgorithm/load_store/odd_sized_type.cpp b/sycl/test-e2e/GroupAlgorithm/load_store/odd_sized_type.cpp index 98cdc1eb9f47a..d5e05b00bf74f 100644 --- 
a/sycl/test-e2e/GroupAlgorithm/load_store/odd_sized_type.cpp +++ b/sycl/test-e2e/GroupAlgorithm/load_store/odd_sized_type.cpp @@ -30,14 +30,11 @@ struct __attribute__((packed)) S { static_assert(sizeof(S) == 7); static_assert(std::is_trivially_copyable_v); -int main() { - +template int test(queue &q) { constexpr std::size_t wg_size = 32; constexpr std::size_t elems_per_wi = 2; constexpr std::size_t n = wg_size * elems_per_wi; - queue q; - buffer input_buf{n}; { @@ -58,8 +55,11 @@ int main() { accessor store_blocked{store_blocked_buf, cgh}; accessor store_striped{store_striped_buf, cgh}; + local_accessor local_acc{wg_size * elems_per_wi, cgh}; + cgh.parallel_for(nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) { auto gid = ndi.get_global_id(0); + auto lid = ndi.get_local_id(0); auto g = ndi.get_group(); S data[elems_per_wi]; @@ -67,22 +67,55 @@ int main() { auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked}; auto striped = sycl_exp::properties{sycl_exp::data_placement_striped}; + if constexpr (addr_space == access::address_space::local_space) { + // Copy input to local memory. 
+ for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi; + i++) { + local_acc[i] = input[i]; + } + ndi.barrier(access::fence_space::local_space); + } + // blocked - sycl_exp::group_load(g, input.begin(), span{data}, blocked); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(g, local_acc.begin(), span{data}, blocked); + } else { + sycl_exp::group_load(g, input.begin(), span{data}, blocked); + } for (int i = 0; i < elems_per_wi; ++i) load_blocked[gid * elems_per_wi + i] = data[i]; // striped - sycl_exp::group_load(g, input.begin(), span{data}, striped); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(g, local_acc.begin(), span{data}, striped); + } else { + sycl_exp::group_load(g, input.begin(), span{data}, striped); + } for (int i = 0; i < elems_per_wi; ++i) load_striped[gid * elems_per_wi + i] = data[i]; // Stores: - std::iota(std::begin(data), std::end(data), gid * elems_per_wi); - - sycl_exp::group_store(g, span{data}, store_blocked.begin(), blocked); - sycl_exp::group_store(g, span{data}, store_striped.begin(), striped); + auto copy_local_acc_to_global_output = [&](accessor output) { + for (int i = lid * elems_per_wi; i < lid * elems_per_wi + elems_per_wi; + i++) { + output[i] = local_acc[i]; + } + }; + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(g, span{data}, local_acc.begin(), blocked); + copy_local_acc_to_global_output(store_blocked); + } else { + sycl_exp::group_store(g, span{data}, store_blocked.begin(), blocked); + } + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(g, span{data}, local_acc.begin(), striped); + copy_local_acc_to_global_output(store_striped); + } else { + sycl_exp::group_store(g, span{data}, store_striped.begin(), striped); + } }); }); @@ -119,3 +152,10 @@ int main() { return 0; } + +int main() { + queue q; + test(q); + test(q); + return 0; +} diff 
--git a/sycl/test-e2e/GroupAlgorithm/load_store/odd_wg_size.cpp b/sycl/test-e2e/GroupAlgorithm/load_store/odd_wg_size.cpp index c778631ccc05f..2b3a3d0f26fff 100644 --- a/sycl/test-e2e/GroupAlgorithm/load_store/odd_wg_size.cpp +++ b/sycl/test-e2e/GroupAlgorithm/load_store/odd_wg_size.cpp @@ -12,7 +12,8 @@ namespace sycl_exp = sycl::ext::oneapi::experimental; // Similar to partial_sg.cpp, but check group (vs. sub_group) loads/stores when // WG_SIZE isn't equally divisible by SG_SIZE. -template void test(queue &q) { +template +void test(queue &q) { constexpr std::size_t wg_size = WG_SIZE; constexpr std::size_t n_wgs = 2; constexpr std::size_t global_size = n_wgs * wg_size; @@ -39,10 +40,12 @@ template void test(queue &q) { accessor store_blocked{store_blocked_buf, cgh}; accessor store_striped{store_striped_buf, cgh}; + local_accessor local_acc{wg_size * elems_per_wi, cgh}; cgh.parallel_for( nd_range<1>{global_size, wg_size}, [=](nd_item<1> ndi) [[sycl::reqd_sub_group_size(SG_SIZE)]] { auto gid = ndi.get_global_id(0); + auto lid = ndi.get_local_id(0); auto g = ndi.get_group(); auto offset = g.get_group_id(0) * g.get_local_range(0) * elems_per_wi; @@ -51,13 +54,32 @@ template void test(queue &q) { auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked}; auto striped = sycl_exp::properties{sycl_exp::data_placement_striped}; + if constexpr (addr_space == access::address_space::local_space) { + // Copy input to local memory. 
+ for (int i = lid * elems_per_wi; + i < lid * elems_per_wi + elems_per_wi; i++) { + local_acc[i] = input[offset + i]; + } + ndi.barrier(access::fence_space::local_space); + } + // blocked - sycl_exp::group_load(g, input.begin() + offset, span{data}, blocked); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(g, local_acc.begin(), span{data}, blocked); + } else { + sycl_exp::group_load(g, input.begin() + offset, span{data}, + blocked); + } for (int i = 0; i < elems_per_wi; ++i) load_blocked[gid * elems_per_wi + i] = data[i]; // striped - sycl_exp::group_load(g, input.begin() + offset, span{data}, striped); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(g, local_acc.begin(), span{data}, striped); + } else { + sycl_exp::group_load(g, input.begin() + offset, span{data}, + striped); + } for (int i = 0; i < elems_per_wi; ++i) load_striped[gid * elems_per_wi + i] = data[i]; @@ -65,10 +87,28 @@ template void test(queue &q) { std::iota(std::begin(data), std::end(data), gid * elems_per_wi); - sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset, - blocked); - sycl_exp::group_store(g, span{data}, store_striped.begin() + offset, - striped); + auto copy_local_acc_to_global_output = [&](accessor output) { + for (int i = lid * elems_per_wi; + i < lid * elems_per_wi + elems_per_wi; i++) { + output[offset + i] = local_acc[i]; + } + }; + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(g, span{data}, local_acc.begin(), blocked); + copy_local_acc_to_global_output(store_blocked); + } else { + sycl_exp::group_store(g, span{data}, store_blocked.begin() + offset, + blocked); + } + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(g, span{data}, local_acc.begin(), striped); + copy_local_acc_to_global_output(store_striped); + } else { + sycl_exp::group_store(g, span{data}, store_striped.begin() + offset, + 
striped); + } }); }); @@ -110,8 +150,10 @@ int main() { if (std::none_of(device_sg_sizes.begin(), device_sg_sizes.end(), [](auto x) { return x == sg_size; })) return; - test(q); - test(q); + test(q); + test(q); + test(q); + test(q); }); return 0; diff --git a/sycl/test-e2e/GroupAlgorithm/load_store/partial_sg.cpp b/sycl/test-e2e/GroupAlgorithm/load_store/partial_sg.cpp index 9ca21ae8a2ee9..b5d50adb88ae1 100644 --- a/sycl/test-e2e/GroupAlgorithm/load_store/partial_sg.cpp +++ b/sycl/test-e2e/GroupAlgorithm/load_store/partial_sg.cpp @@ -1,16 +1,16 @@ // RUN: %{build} -Wno-error=incorrect-sub-group-size -o %t.out // RUN: %{run} %t.out +#include #include #include +#include #include -#include - using namespace sycl; namespace sycl_exp = sycl::ext::oneapi::experimental; -template void test(queue &q) { +template void test(queue &q) { // Verify scenario when the last sub_group isn't full. constexpr std::size_t wg_size = SG_SIZE * 3 / 2; constexpr std::size_t elems_per_wi = 4; @@ -36,6 +36,7 @@ template void test(queue &q) { accessor store_blocked{store_blocked_buf, cgh}; accessor store_striped{store_striped_buf, cgh}; + local_accessor local_acc{wg_size * elems_per_wi, cgh}; cgh.parallel_for( nd_range<1>{wg_size, wg_size}, [=](nd_item<1> ndi) [[sycl::reqd_sub_group_size(SG_SIZE)]] { @@ -49,13 +50,34 @@ template void test(queue &q) { auto blocked = sycl_exp::properties{sycl_exp::data_placement_blocked}; auto striped = sycl_exp::properties{sycl_exp::data_placement_striped}; + if constexpr (addr_space == access::address_space::local_space) { + // Copy input to local memory. 
+ for (int i = sg.get_local_id() * elems_per_wi; + i < sg.get_local_id() * elems_per_wi + elems_per_wi; i++) { + local_acc[offset + i] = input[offset + i]; + } + group_barrier(sg); + } + // blocked - sycl_exp::group_load(sg, input.begin() + offset, span{data}, blocked); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(sg, local_acc.begin() + offset, span{data}, + blocked); + } else { + sycl_exp::group_load(sg, input.begin() + offset, span{data}, + blocked); + } for (int i = 0; i < elems_per_wi; ++i) load_blocked[gid * elems_per_wi + i] = data[i]; // striped - sycl_exp::group_load(sg, input.begin() + offset, span{data}, striped); + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_load(sg, local_acc.begin() + offset, span{data}, + striped); + } else { + sycl_exp::group_load(sg, input.begin() + offset, span{data}, + striped); + } for (int i = 0; i < elems_per_wi; ++i) load_striped[gid * elems_per_wi + i] = data[i]; @@ -63,10 +85,30 @@ template void test(queue &q) { std::iota(std::begin(data), std::end(data), gid * elems_per_wi); - sycl_exp::group_store(sg, span{data}, store_blocked.begin() + offset, - blocked); - sycl_exp::group_store(sg, span{data}, store_striped.begin() + offset, - striped); + auto copy_local_acc_to_global_output = [&](accessor output) { + for (int i = sg.get_local_id() * elems_per_wi; + i < sg.get_local_id() * elems_per_wi + elems_per_wi; i++) { + output[offset + i] = local_acc[offset + i]; + } + }; + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(sg, span{data}, local_acc.begin() + offset, + blocked); + copy_local_acc_to_global_output(store_blocked); + } else { + sycl_exp::group_store(sg, span{data}, + store_blocked.begin() + offset, blocked); + } + + if constexpr (addr_space == access::address_space::local_space) { + sycl_exp::group_store(sg, span{data}, local_acc.begin() + offset, + striped); + 
copy_local_acc_to_global_output(store_striped); + } else { + sycl_exp::group_store(sg, span{data}, + store_striped.begin() + offset, striped); + } }); }); @@ -110,8 +152,10 @@ int main() { detail::loop([&](auto sg_size_idx) { constexpr auto sg_size = sg_sizes[sg_size_idx]; if (std::any_of(device_sg_sizes.begin(), device_sg_sizes.end(), - [](auto x) { return x == sg_size; })) - test(q); + [](auto x) { return x == sg_size; })) { + test(q); + test(q); + } }); return 0; diff --git a/sycl/test/check_device_code/group_load.cpp b/sycl/test/check_device_code/group_load.cpp index 1da28fb4107b3..56c79401b1966 100644 --- a/sycl/test/check_device_code/group_load.cpp +++ b/sycl/test/check_device_code/group_load.cpp @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clangxx -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck %s +// RUN: %clangxx -DGLOBAL_SPACE -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck --check-prefix CHECK-GLOBAL %s +// RUN: %clangxx -DLOCAL_SPACE -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck --check-prefix CHECK-LOCAL %s // Windows/linux have some slight differences in IR generation (function // arguments passing and long/long long differences/mangling) that could @@ -15,91 +16,164 @@ namespace oneapi_exp = sycl::ext::oneapi::experimental; using namespace sycl::ext::oneapi::experimental; using full_group_blocked = - decltype(properties(full_group, data_placement_blocked)); + decltype(properties(full_group, data_placement_blocked, + oneapi_exp::detail::native_local_block_io)); using naive_blocked = - decltype(properties(oneapi_exp::detail::naive, data_placement_blocked)); + decltype(properties(oneapi_exp::detail::naive, data_placement_blocked, + oneapi_exp::detail::native_local_block_io)); 
using opt_blocked = - decltype(properties(full_group, contiguous_memory, data_placement_blocked)); + decltype(properties(full_group, contiguous_memory, data_placement_blocked, + oneapi_exp::detail::native_local_block_io)); using full_group_striped = - decltype(properties(full_group, data_placement_striped)); + decltype(properties(full_group, data_placement_striped, + oneapi_exp::detail::native_local_block_io)); using naive_striped = - decltype(properties(oneapi_exp::detail::naive, data_placement_striped)); + decltype(properties(oneapi_exp::detail::naive, data_placement_striped, + oneapi_exp::detail::native_local_block_io)); using opt_striped = - decltype(properties(full_group, contiguous_memory, data_placement_striped)); + decltype(properties(full_group, contiguous_memory, data_placement_striped, + oneapi_exp::detail::native_local_block_io)); +#ifdef GLOBAL_SPACE template -using plain_global_ptr = typename sycl::detail::DecoratedType< +using plain_ptr = typename sycl::detail::DecoratedType< T, access::address_space::global_space>::type *; +#else +template +using plain_ptr = typename sycl::detail::DecoratedType< + T, access::address_space::local_space>::type *; +#endif namespace blocked { -// CHECK-LABEL: @_ZN7blocked10test_naiveERN4sycl3_V19sub_groupEPU3AS1iRi( -// CHECK-NEXT: entry: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4:[0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, plain_global_ptr p, - int &out) { +// CHECK-GLOBAL-LABEL: @_ZN7blocked10test_naiveERN4sycl3_V19sub_groupEPU3AS1iRi( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4:[0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8:![0-9]+]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked10test_naiveERN4sycl3_V19sub_groupEPU3AS3iRi( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5:[0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8:![0-9]+]] +// CHECK-LOCAL-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, plain_ptr p, int &out) { // Ensure `detail::naive` always results in no block loads/stores. group_load(sg, p, out, naive_blocked{}); } -// CHECK-LABEL: @_ZN7blocked14test_optimizedERN4sycl3_V19sub_groupEPU3AS1iRi( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked14test_optimizedERN4sycl3_V19sub_groupEPU3AS1iRi( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// CHECK-GLOBAL-NEXT: ret void // -SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, plain_global_ptr p, +// CHECK-LOCAL-LABEL: @_ZN7blocked14test_optimizedERN4sycl3_V19sub_groupEPU3AS3iRi( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// 
CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, plain_ptr p, int &out) { // Check that optimized implementation is selected. group_load(sg, p, out, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked27test_contiguous_auto_detectERN4sycl3_V19sub_groupEPU3AS1iRi( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked27test_contiguous_auto_detectERN4sycl3_V19sub_groupEPU3AS1iRi( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked27test_contiguous_auto_detectERN4sycl3_V19sub_groupEPU3AS3iRi( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, - plain_global_ptr p, - int &out) { + plain_ptr p, 
int &out) { // Check that optimized implementation is selected. group_load(sg, p, out, full_group_blocked{}); } // SYCL 2020's accessor can't be statically known to be contiguous. +#ifdef GLOBAL_SPACE using accessor_iter_t = accessor::iterator; +#else +using accessor_iter_t = local_accessor::iterator; +#endif -// CHECK-LABEL: @_ZN7blocked18test_accessor_iterERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEERi( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15:![0-9]+]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18:![0-9]+]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[CONV3_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV3_I_I_I]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked18test_accessor_iterERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEERi( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), 
ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18:![0-9]+]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[CONV3_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-GLOBAL-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV3_I_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked18test_accessor_iterERN4sycl3_V19sub_groupERPiRi( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15:![0-9]+]], !nonnull [[META7:![0-9]+]], !noundef [[META7]] +// CHECK-LOCAL-NEXT: [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(3) @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPvi(ptr addrspace(4) noundef nonnull [[TMP0]], i32 noundef 4) #[[ATTR6:[0-9]+]] +// CHECK-LOCAL-NEXT: [[TOBOOL5_NOT_I_I:%.*]] = icmp eq ptr addrspace(3) [[CALL_I_I_I]], null +// CHECK-LOCAL-NEXT: br i1 [[TOBOOL5_NOT_I_I]], label [[IF_ELSE_I_I:%.*]], label 
[[IF_THEN6_I_I:%.*]] +// CHECK-LOCAL: if.then6.i.i: +// CHECK-LOCAL-NEXT: [[CALL7_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[CALL_I_I_I]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label [[IF_END17_I_I:%.*]] +// CHECK-LOCAL: if.else.i.i: +// CHECK-LOCAL-NEXT: [[CALL_I38_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef nonnull [[TMP0]], i32 noundef 5) #[[ATTR6]] +// CHECK-LOCAL-NEXT: [[TOBOOL9_NOT_NOT_I_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I38_I_I]], null +// CHECK-LOCAL-NEXT: br i1 [[TOBOOL9_NOT_NOT_I_I]], label [[CLEANUP_THREAD_I_I:%.*]], label [[CLEANUP_I_I:%.*]] +// CHECK-LOCAL: cleanup.thread.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I43_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I44_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 [[IDXPROM_I43_I_I]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I44_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPIINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS8_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESO_SM_RSN_SP__EXIT:%.*]] +// CHECK-LOCAL: cleanup.i.i: +// CHECK-LOCAL-NEXT: [[CALL11_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I38_I_I]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label [[IF_END17_I_I]] +// CHECK-LOCAL: if.end17.i.i: +// CHECK-LOCAL-NEXT: [[LOAD_1_I_I:%.*]] = phi i32 [ [[CALL11_I_I]], [[CLEANUP_I_I]] ], [ [[CALL7_I_I]], [[IF_THEN6_I_I]] ] +// CHECK-LOCAL-NEXT: store i32 [[LOAD_1_I_I]], ptr addrspace(4) [[OUT]], align 4 +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPIINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS8_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESO_SM_RSN_SP__EXIT]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPiiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_14full_group_keyEJEEENSA_INS8_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESO_SM_RSN_SP_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, accessor_iter_t &iter, int &out) { @@ -107,32 +181,74 @@ SYCL_EXTERNAL void 
test_accessor_iter(sycl::sub_group &sg, group_load(sg, iter, out, full_group_blocked{}); } -// CHECK-LABEL: @_ZN7blocked34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEERi( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I_I]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I_I]]) -// CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I_I]], i32 noundef 5) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: [[TOBOOL_NOT_I_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I_I]], null -// CHECK-NEXT: br i1 [[TOBOOL_NOT_I_I]], label [[IF_THEN_I_I:%.*]], label [[IF_END_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[CONV3_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], i64 [[CONV3_I_I_I]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa 
[[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_SP_RSQ_SS__EXIT:%.*]] -// CHECK: if.end.i.i: -// CHECK-NEXT: [[CALL6_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I_I]]) #[[ATTR4]] -// CHECK-NEXT: store i32 [[CALL6_I_I]], ptr addrspace(4) [[OUT]], align 4 -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_SP_RSQ_SS__EXIT]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_SP_RSQ_SS_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: 
@_ZN7blocked34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEERi( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-GLOBAL-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I_I]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(3) @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I_I]], i32 noundef 4) #[[ATTR5:[0-9]+]] +// CHECK-GLOBAL-NEXT: [[TOBOOL7_NOT_I_I:%.*]] = icmp eq ptr addrspace(3) [[CALL_I_I_I]], null +// CHECK-GLOBAL-NEXT: br i1 [[TOBOOL7_NOT_I_I]], label [[IF_ELSE_I_I:%.*]], label [[IF_THEN8_I_I:%.*]] +// CHECK-GLOBAL: if.then8.i.i: +// CHECK-GLOBAL-NEXT: [[CALL9_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[CALL_I_I_I]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[IF_END20_I_I:%.*]] +// CHECK-GLOBAL: if.else.i.i: +// CHECK-GLOBAL-NEXT: [[CALL_I41_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I_I]], i32 noundef 5) #[[ATTR5]] +// CHECK-GLOBAL-NEXT: [[TOBOOL11_NOT_NOT_I_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I41_I_I]], null +// CHECK-GLOBAL-NEXT: br i1 [[TOBOOL11_NOT_NOT_I_I]], 
label [[CLEANUP_THREAD_I_I:%.*]], label [[CLEANUP_I_I:%.*]] +// CHECK-GLOBAL: cleanup.thread.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[CONV3_I48_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-GLOBAL-NEXT: [[ADD_PTR_I_I_I52_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], i64 [[CONV3_I48_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I52_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEEST_SR_RSS_SU__EXIT:%.*]] +// CHECK-GLOBAL: cleanup.i.i: +// CHECK-GLOBAL-NEXT: [[CALL13_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I41_I_I]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[IF_END20_I_I]] +// CHECK-GLOBAL: if.end20.i.i: +// CHECK-GLOBAL-NEXT: [[LOAD_1_I_I:%.*]] = phi i32 [ [[CALL13_I_I]], [[CLEANUP_I_I]] ], [ [[CALL9_I_I]], [[IF_THEN8_I_I]] ] +// CHECK-GLOBAL-NEXT: store i32 [[LOAD_1_I_I]], ptr addrspace(4) [[OUT]], align 4 +// CHECK-GLOBAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEEST_SR_RSS_SU__EXIT]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEENSD_INSB_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeEST_SR_RSS_SU_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupERPiRi( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]], !nonnull [[META7]], !noundef [[META7]] +// CHECK-LOCAL-NEXT: [[CALL_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(3) @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPvi(ptr addrspace(4) noundef nonnull [[TMP0]], i32 noundef 4) #[[ATTR6]] +// CHECK-LOCAL-NEXT: [[TOBOOL5_NOT_I_I:%.*]] = icmp eq ptr addrspace(3) [[CALL_I_I_I]], null +// CHECK-LOCAL-NEXT: br i1 [[TOBOOL5_NOT_I_I]], label [[IF_ELSE_I_I:%.*]], label [[IF_THEN6_I_I:%.*]] +// CHECK-LOCAL: if.then6.i.i: +// CHECK-LOCAL-NEXT: [[CALL7_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[CALL_I_I_I]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label 
[[IF_END17_I_I:%.*]] +// CHECK-LOCAL: if.else.i.i: +// CHECK-LOCAL-NEXT: [[CALL_I38_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef nonnull [[TMP0]], i32 noundef 5) #[[ATTR6]] +// CHECK-LOCAL-NEXT: [[TOBOOL9_NOT_NOT_I_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I38_I_I]], null +// CHECK-LOCAL-NEXT: br i1 [[TOBOOL9_NOT_NOT_I_I]], label [[CLEANUP_THREAD_I_I:%.*]], label [[CLEANUP_I_I:%.*]] +// CHECK-LOCAL: cleanup.thread.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I43_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I44_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 [[IDXPROM_I43_I_I]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I44_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: store i32 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPIINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS8_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESQ_SO_RSP_SR__EXIT:%.*]] +// CHECK-LOCAL: cleanup.i.i: +// CHECK-LOCAL-NEXT: [[CALL11_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull 
[[CALL_I38_I_I]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label [[IF_END17_I_I]] +// CHECK-LOCAL: if.end17.i.i: +// CHECK-LOCAL-NEXT: [[LOAD_1_I_I:%.*]] = phi i32 [ [[CALL11_I_I]], [[CLEANUP_I_I]] ], [ [[CALL7_I_I]], [[IF_THEN6_I_I]] ] +// CHECK-LOCAL-NEXT: store i32 [[LOAD_1_I_I]], ptr addrspace(4) [[OUT]], align 4 +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPIINS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSA_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSA_INS3_14FULL_GROUP_KEYEJEEENSA_INS8_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESQ_SO_RSP_SR__EXIT]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPiiNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS8_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESQ_SO_RSP_SR_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, accessor_iter_t &iter, @@ -141,178 +257,316 @@ SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, group_load(sg, iter, out, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked24test_runtime_align_checkERN4sycl3_V19sub_groupEPU3AS1cRc( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 3 -// CHECK-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 
0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] -// CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 1, !tbaa [[TBAA22:![0-9]+]] -// CHECK-NEXT: store i8 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 1, !tbaa [[TBAA22]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_SN_RSO_SQ__EXIT:%.*]] -// CHECK: if.end.i.i: -// CHECK-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef zeroext i8 @_Z30__spirv_SubgroupBlockReadINTELIhET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: store i8 [[CALL4_I_I]], ptr addrspace(4) [[OUT]], align 1 -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_SN_RSO_SQ__EXIT]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_SN_RSO_SQ_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked24test_runtime_align_checkERN4sycl3_V19sub_groupEPU3AS1cRc( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 3 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-GLOBAL: if.then.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load 
i8, ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 1, !tbaa [[TBAA22:![0-9]+]] +// CHECK-GLOBAL-NEXT: store i8 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 1, !tbaa [[TBAA22]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_SP_RSQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i.i: +// CHECK-GLOBAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef zeroext i8 @_Z30__spirv_SubgroupBlockReadINTELIhET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store i8 [[CALL4_I_I]], ptr addrspace(4) [[OUT]], align 1 +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_SP_RSQ_SS__EXIT]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_SP_RSQ_SS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked24test_runtime_align_checkERN4sycl3_V19sub_groupEPU3AS3cRc( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 3 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-LOCAL: if.then.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(3) [[ARRAYIDX_I_I_I]], align 1, !tbaa [[TBAA20:![0-9]+]] +// CHECK-LOCAL-NEXT: store i8 [[TMP2]], ptr addrspace(4) [[OUT:%.*]], align 1, !tbaa [[TBAA20]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3CCNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_SP_RSQ_SS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i.i: +// CHECK-LOCAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef zeroext i8 @_Z30__spirv_SubgroupBlockReadINTELIhET_PU3AS3Kh(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store i8 [[CALL4_I_I]], ptr addrspace(4) [[OUT]], align 1 +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3CCNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_SP_RSQ_SS__EXIT]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ccNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_SP_RSQ_SS_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_runtime_align_check(sycl::sub_group &sg, - plain_global_ptr p, - char &out) { + plain_ptr p, char &out) { // Run-time alignment check is needed if type's alignment is less than // BlockRead requirements. 
group_load(sg, p, out, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked16test_four_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm4EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META26:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// 
CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29:![0-9]+]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP31:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef i64 @_Z30__spirv_SubgroupBlockReadINTELImET_PU3AS1Km(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: 
store i64 [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, - plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7blocked16test_four_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm4EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 
noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META26:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP31:![0-9]+]] 
+// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef i64 @_Z30__spirv_SubgroupBlockReadINTELImET_PU3AS1Km(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store i64 [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] +// 
CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked16test_four_shortsERN4sycl3_V19sub_groupEPU3AS3sNS1_4spanIsLm4EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA22:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META24:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I14_I]], label 
[[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(3) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27:![0-9]+]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP29:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef i64 @_Z30__spirv_SubgroupBlockReadINTELImET_PU3AS3Km(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store i64 [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, 
plain_ptr p, span out) { // Four shorts in blocked data layout could be loaded as a single 64-bit // integer. group_load(sg, p, out, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm3EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META34:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 3 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: 
[[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP37:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, - plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7blocked21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm3EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META34:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 3 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-GLOBAL-NEXT: 
br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP37:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, 
i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS3iNS1_4spanIiLm3EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META32:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 3 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, 
ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP35:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, plain_ptr p, span out) { // Check for non-power-of-two size. 
group_load(sg, p, out, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked14test_four_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm4EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META39:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP2]], 2 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// 
CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP42:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_four_ints(sycl::sub_group &sg, plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7blocked14test_four_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm4EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META39:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP2]], 2 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP42:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) 
#[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked14test_four_intsERN4sycl3_V19sub_groupEPU3AS3iNS1_4spanIiLm4EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META37:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP2]], 2 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3IILM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) 
[[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP40:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3iiLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_four_ints(sycl::sub_group &sg, plain_ptr p, span out) { // Four int elements in blocked data layout don't map directly to any // BlockRead API. 
group_load(sg, p, out, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked15test_seven_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm7EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META44:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 7 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 7 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM7ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: 
store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP47:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm7ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_seven_ints(sycl::sub_group &sg, plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7blocked15test_seven_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm7EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META44:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 7 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM7ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP47:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm7ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// 
CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked15test_seven_intsERN4sycl3_V19sub_groupEPU3AS3iNS1_4spanIiLm7EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META42:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 7 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3IILM7ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa 
[[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP3]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP45:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3iiLm7ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_seven_ints(sycl::sub_group &sg, plain_ptr p, span out) { // Similar to four elements case but more complex to optimize. group_load(sg, p, out, opt_blocked{}); @@ -323,103 +577,162 @@ namespace striped { // Striped data layout with one element per work item isn't different from // blocked data layout, so use span version only in the checks below. 
-// CHECK-LABEL: @_ZN7striped10test_naiveERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm2EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META49:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META52:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESN_SL_NS0_4SPANISM_XT2_EEESO__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 
[[CONV_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP55:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESN_SL_NS0_4spanISM_XT2_EEESO_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7striped10test_naiveERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm2EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META49:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META52:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-GLOBAL: for.cond.i: +// CHECK-GLOBAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i: +// CHECK-GLOBAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-GLOBAL-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP55:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// 
CHECK-LOCAL-LABEL: @_ZN7striped10test_naiveERN4sycl3_V19sub_groupEPU3AS3iNS1_4spanIiLm2EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META47:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META50:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-LOCAL: for.cond.i: +// CHECK-LOCAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3IILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] +// CHECK-LOCAL: for.body.i: +// CHECK-LOCAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-LOCAL-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr 
addrspace(3) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP53:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3iiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, plain_ptr p, span out) { // Ensure `detail::naive` always results in no block loads/stores. 
group_load(sg, p, out, naive_striped{}); } -// CHECK-LABEL: @_ZN7striped14test_optimizedERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm2EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: store <2 x i32> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 4 -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7striped14test_optimizedERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm2EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store <2 x i32> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 4 +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped14test_optimizedERN4sycl3_V19sub_groupEPU3AS3iNS1_4spanIiLm2EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void 
@llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store <2 x i32> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 4 +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, plain_ptr p, span out) { // Check that optimized implementation is selected. group_load(sg, p, out, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped27test_contiguous_auto_detectERN4sycl3_V19sub_groupEPU3AS1iRi( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped27test_contiguous_auto_detectERN4sycl3_V19sub_groupEPU3AS1iRi( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped27test_contiguous_auto_detectERN4sycl3_V19sub_groupEPU3AS3iRi( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 
@_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, - plain_global_ptr p, - int &out) { + plain_ptr p, int &out) { // Check that optimized implementation is selected. group_load(sg, p, out, full_group_striped{}); } // SYCL 2020's accessor can't be statically known to be contiguous. +#ifdef GLOBAL_SPACE using accessor_iter_t = accessor::iterator; +#else +using accessor_iter_t = local_accessor::iterator; +#endif -// CHECK-LABEL: @_ZN7striped18test_accessor_iterERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEENS1_4spanIiLm2EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META59:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META62:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// 
CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV3_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP65:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// 
CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped18test_accessor_iterERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEENS1_4spanIiLm2EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META59:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META62:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV3_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP65:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_14full_group_keyEJEEENSD_INSB_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 
noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped18test_accessor_iterERN4sycl3_V19sub_groupERPiNS1_4spanIiLm2EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[AGG_TMP:%.*]] = alloca %"struct.sycl::_V1::sub_group", align 1 +// CHECK-LOCAL-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::span.22", align 8 +// CHECK-LOCAL-NEXT: [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::experimental::properties.30", align 1 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: store i64 [[TMP1]], ptr [[AGG_TMP1]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPiiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_14full_group_keyEJEEENSA_INS8_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESO_SM_NS0_4spanISN_XT2_EEESP_(ptr noundef nonnull byval(%"struct.sycl::_V1::sub_group") align 1 [[AGG_TMP]], ptr addrspace(4) noundef [[TMP0]], ptr noundef nonnull byval(%"class.sycl::_V1::span.22") align 8 [[AGG_TMP1]], ptr noundef nonnull byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.30") align 1 [[AGG_TMP2]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, accessor_iter_t &iter, span out) { @@ -427,48 +740,69 @@ SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, group_load(sg, iter, out, full_group_striped{}); } -// CHECK-LABEL: 
@_ZN7striped34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEENS1_4spanIiLm2EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I]], i32 noundef 5) #[[ATTR5]] -// CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I]], null -// CHECK-NEXT: br i1 [[TOBOOL_NOT_I]], label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META67:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META70:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ 
[[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[CONV3_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], i64 [[CONV3_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP73:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEENSD_INSB_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[CALL6_I:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I]]) #[[ATTR4]] -// CHECK-NEXT: store <2 x i32> [[CALL6_I]], ptr addrspace(4) [[TMP1]], align 4 -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupERNS1_6detail17accessor_iteratorIKiLi1EEENS1_4spanIiLm2EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[CALL_I_I:%.*]] = tail call spir_func noundef ptr addrspace(3) @_Z40__spirv_GenericCastToPtrExplicit_ToLocalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I]], i32 noundef 4) #[[ATTR5]] +// CHECK-GLOBAL-NEXT: [[TOBOOL7_NOT_I:%.*]] = icmp eq ptr addrspace(3) [[CALL_I_I]], null +// CHECK-GLOBAL-NEXT: br i1 [[TOBOOL7_NOT_I]], label [[IF_ELSE_I:%.*]], label [[IF_THEN8_I:%.*]] 
+// CHECK-GLOBAL: if.then8.i: +// CHECK-GLOBAL-NEXT: [[CALL9_I:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[CALL_I_I]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[IF_END20_I:%.*]] +// CHECK-GLOBAL: if.else.i: +// CHECK-GLOBAL-NEXT: [[CALL_I41_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPKvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I]], i32 noundef 5) #[[ATTR5]] +// CHECK-GLOBAL-NEXT: [[TOBOOL11_NOT_NOT_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I41_I]], null +// CHECK-GLOBAL-NEXT: br i1 [[TOBOOL11_NOT_NOT_I]], label [[IF_ELSE14_I:%.*]], label [[CLEANUP_THREAD_I:%.*]] +// CHECK-GLOBAL: cleanup.thread.i: +// CHECK-GLOBAL-NEXT: [[CALL13_I:%.*]] = tail call spir_func noundef <2 x i32> @_Z30__spirv_SubgroupBlockReadINTELIDv2_jET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[CALL_I41_I]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[IF_END20_I]] +// CHECK-GLOBAL: if.else14.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META67:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META70:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I51_I:%.*]] +// CHECK-GLOBAL: for.cond.i51.i: +// CHECK-GLOBAL-NEXT: [[I_0_I52_I:%.*]] = phi i32 [ 0, [[IF_ELSE14_I]] ], [ [[INC_I61_I:%.*]], [[FOR_BODY_I54_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I53_I:%.*]] = icmp samesign ult i32 [[I_0_I52_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I53_I]], label [[FOR_BODY_I54_I]], label [[CLEANUP_I:%.*]] +// CHECK-GLOBAL: for.body.i54.i: +// CHECK-GLOBAL-NEXT: [[CONV_I55_I:%.*]] = zext nneg i32 [[I_0_I52_I]] to i64 +// 
CHECK-GLOBAL-NEXT: [[MUL_I_I56_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I52_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I57_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I56_I]] +// CHECK-GLOBAL-NEXT: [[CONV3_I58_I:%.*]] = sext i32 [[ADD_I_I57_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ADD_PTR_I_I_I59_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], i64 [[CONV3_I58_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ADD_PTR_I_I_I59_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I60_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I55_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I60_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I61_I]] = add nuw nsw i32 [[I_0_I52_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I51_I]], !llvm.loop [[LOOP73:![0-9]+]] +// CHECK-GLOBAL: cleanup.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT:%.*]] +// CHECK-GLOBAL: if.end20.i: +// CHECK-GLOBAL-NEXT: [[LOAD_1_I:%.*]] = phi <2 x i32> [ [[CALL9_I]], [[IF_THEN8_I]] ], [ [[CALL13_I]], [[CLEANUP_THREAD_I]] ] +// CHECK-GLOBAL-NEXT: store <2 x i32> [[LOAD_1_I]], ptr addrspace(4) [[TMP1]], align 4 +// CHECK-GLOBAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPENS0_6DETAIL17ACCESSOR_ITERATORIKILI1EEEILM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSD_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSD_INS3_14FULL_GROUP_KEYEJEEENSD_INSB_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupENS0_6detail17accessor_iteratorIKiLi1EEEiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEENSD_INSB_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupERPiNS1_4spanIiLm2EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[AGG_TMP:%.*]] = alloca %"struct.sycl::_V1::sub_group", align 1 +// CHECK-LOCAL-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::span.22", align 8 +// CHECK-LOCAL-NEXT: [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::experimental::properties.28", align 1 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: store i64 [[TMP1]], ptr [[AGG_TMP1]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: tail call spir_func void 
@_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPiiLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSA_INS3_21contiguous_memory_keyEJEEENSA_INS3_14full_group_keyEJEEENSA_INS8_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESQ_SO_NS0_4spanISP_XT2_EEESR_(ptr noundef nonnull byval(%"struct.sycl::_V1::sub_group") align 1 [[AGG_TMP]], ptr addrspace(4) noundef [[TMP0]], ptr noundef nonnull byval(%"class.sycl::_V1::span.22") align 8 [[AGG_TMP1]], ptr noundef nonnull byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.28") align 1 [[AGG_TMP2]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, accessor_iter_t &iter, @@ -477,246 +811,442 @@ SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, group_load(sg, iter, out, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped24test_runtime_align_checkERN4sycl3_V19sub_groupEPU3AS1cNS1_4spanIcLm2EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA75:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) 
@__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META77:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META80:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA22]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i8 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 1, !tbaa [[TBAA22]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP83:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <2 x i8> @_Z30__spirv_SubgroupBlockReadINTELIDv2_hET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: store <2 x i8> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 1 -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped24test_runtime_align_checkERN4sycl3_V19sub_groupEPU3AS1cNS1_4spanIcLm2EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA75:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META77:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META80:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// 
CHECK-GLOBAL-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i8, ptr addrspace(1) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA22]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i8 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 1, !tbaa [[TBAA22]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP83:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <2 x i8> @_Z30__spirv_SubgroupBlockReadINTELIDv2_hET_PU3AS1Kh(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store <2 x i8> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 1 +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped24test_runtime_align_checkERN4sycl3_V19sub_groupEPU3AS3cNS1_4spanIcLm2EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA76:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META78:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META81:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I14_I:%.*]] = icmp 
samesign ult i32 [[I_0_I_I]], 2 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i8, ptr addrspace(3) [[ARRAYIDX_I_I]], align 1, !tbaa [[TBAA20]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i8 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 1, !tbaa [[TBAA20]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP84:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <2 x i8> @_Z30__spirv_SubgroupBlockReadINTELIDv2_hET_PU3AS3Kh(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store <2 x i8> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 1 +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3CCLM2ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ccLm2ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_runtime_align_check(sycl::sub_group &sg, - plain_global_ptr p, + plain_ptr p, span out) { // Run-time alignment check is needed if type's alignment is less than // BlockRead requirements. group_load(sg, p, out, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped16test_four_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm4EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META85:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META88:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: 
[[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP91:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <4 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv4_tET_PU3AS1Kt(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: store <4 x i16> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, - plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7striped16test_four_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm4EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META85:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META88:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// 
CHECK-GLOBAL-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP91:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <4 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv4_tET_PU3AS1Kt(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store <4 x i16> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped16test_four_shortsERN4sycl3_V19sub_groupEPU3AS3sNS1_4spanIsLm4EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA22]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META86:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META89:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 
[[I_0_I_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(3) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP92:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <4 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv4_tET_PU3AS3Kt(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store <4 x i16> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM4ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ssLm4ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, plain_ptr p, span out) { // Just because there is a blocked data layout testcase, nothing inherently // useful here. group_load(sg, p, out, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped19test_sixteen_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm16EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META93:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META96:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, 
[[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 -// CHECK-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP99:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <16 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv16_tET_PU3AS1Kt(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] -// CHECK-NEXT: store <16 x i16> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_sixteen_shorts(sycl::sub_group &sg, - plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7striped19test_sixteen_shortsERN4sycl3_V19sub_groupEPU3AS1sNS1_4spanIsLm16EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA24]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META93:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META96:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// 
CHECK-GLOBAL-NEXT: [[CMP_I14_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA29]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP99:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <16 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv16_tET_PU3AS1Kt(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: store <16 x i16> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] +// 
CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1ssLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped19test_sixteen_shortsERN4sycl3_V19sub_groupEPU3AS3sNS1_4spanIsLm16EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA22]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 3 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META94:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META97:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I14_I:%.*]] = 
icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I14_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_SR_NS0_4SPANISS_XT2_EEESU__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(3) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP5]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP100:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ssLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_SR_NS0_4spanISS_XT2_EEESU_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: [[CALL4_I:%.*]] = tail call spir_func noundef <16 x i16> @_Z30__spirv_SubgroupBlockReadINTELIDv16_tET_PU3AS3Kt(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR5]] +// CHECK-LOCAL-NEXT: store <16 x i16> [[CALL4_I]], ptr addrspace(4) [[TMP1]], align 2 +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3SSLM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3ssLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_sixteen_shorts(sycl::sub_group &sg, plain_ptr p, span out) { group_load(sg, p, out, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm3EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META93:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META96:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP99:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, - 
plain_global_ptr p, +// CHECK-GLOBAL-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm3EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META101:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META104:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = 
getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP107:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupEPU3AS3iNS1_4spanIiLm3EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META102:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META105:![0-9]+]] +// 
CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3IILM3ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP108:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3iiLm3ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, plain_ptr p, span out) { // Check for non-power-of-two size. group_load(sg, p, out, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped17test_sixteen_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm16EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META101:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META104:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP107:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_sixteen_ints(sycl::sub_group &sg, - plain_global_ptr 
p, +// CHECK-GLOBAL-LABEL: @_ZN7striped17test_sixteen_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm16EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META109:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META112:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds 
i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP115:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped17test_sixteen_intsERN4sycl3_V19sub_groupEPU3AS3iNS1_4spanIiLm16EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META110:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META113:![0-9]+]] +// CHECK-LOCAL-NEXT: br label 
[[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3IILM16ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP116:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3iiLm16ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_sixteen_ints(sycl::sub_group &sg, plain_ptr p, span out) { // Even though power of two, still too many to map directly onto BloadRead // API. group_load(sg, p, out, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped16test_eleven_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm11EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META109:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META112:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 11 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM11ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_SN_NS0_4SPANISO_XT2_EEESQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP115:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm11ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_SN_NS0_4spanISO_XT2_EEESQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_eleven_ints(sycl::sub_group &sg, - plain_global_ptr 
p, +// CHECK-GLOBAL-LABEL: @_ZN7striped16test_eleven_intsERN4sycl3_V19sub_groupEPU3AS1iNS1_4spanIiLm11EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META117:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META120:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 11 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS1IILM11ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds 
i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP123:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS1iiLm11ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR4]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped16test_eleven_intsERN4sycl3_V19sub_groupEPU3AS3iNS1_4spanIiLm11EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[OUT:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META118:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META121:![0-9]+]] +// CHECK-LOCAL-NEXT: br label 
[[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 11 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL10GROUP_LOADINS0_9SUB_GROUPEPU3AS3IILM11ENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE17VERIFY_LOAD_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_SP_NS0_4SPANISQ_XT2_EEESS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP4]], ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP124:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental10group_loadINS0_9sub_groupEPU3AS3iiLm11ENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE17verify_load_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_SP_NS0_4spanISQ_XT2_EEESS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_eleven_ints(sycl::sub_group &sg, plain_ptr p, span out) { // Non-power of two case bigger than max natively supported power of two case. group_load(sg, p, out, opt_striped{}); diff --git a/sycl/test/check_device_code/group_load_store_native_key.cpp b/sycl/test/check_device_code/group_load_store_native_key.cpp new file mode 100644 index 0000000000000..d9bac88ba8d71 --- /dev/null +++ b/sycl/test/check_device_code/group_load_store_native_key.cpp @@ -0,0 +1,179 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clangxx -DGLOBAL_SPACE -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck --check-prefix CHECK-GLOBAL %s +// RUN: %clangxx -DLOCAL_SPACE -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck --check-prefix CHECK-LOCAL %s + +// REQUIRES: linux + +// Test that in case of local address space, intrinsic is generated only if +// native_local_block_io property is set. 
+ +#include + +using namespace sycl; + +namespace oneapi_exp = sycl::ext::oneapi::experimental; +using namespace sycl::ext::oneapi::experimental; + +using opt_blocked = + decltype(properties(full_group, contiguous_memory, data_placement_blocked)); +using opt_blocked_native = + decltype(properties(full_group, contiguous_memory, data_placement_blocked, + oneapi_exp::detail::native_local_block_io)); + +#ifdef GLOBAL_SPACE +template +using plain_ptr = typename sycl::detail::DecoratedType< + T, access::address_space::global_space>::type *; +#else +template +using plain_ptr = typename sycl::detail::DecoratedType< + T, access::address_space::local_space>::type *; +#endif + +// CHECK-GLOBAL-LABEL: @_Z9test_loadRN4sycl3_V19sub_groupEPU3AS1iRi( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR3:[0-9]+]] +// CHECK-GLOBAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_Z9test_loadRN4sycl3_V19sub_groupEPU3AS3iRi( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3:[0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I18_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I19_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i64 [[IDXPROM_I18_I_I]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr 
addrspace(3) [[ARRAYIDX_I19_I_I]], align 4, !tbaa [[TBAA8:![0-9]+]] +// CHECK-LOCAL-NEXT: store i32 [[TMP1]], ptr addrspace(4) [[OUT:%.*]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_load(sycl::sub_group &sg, plain_ptr p, int &out) { + group_load(sg, p, out, opt_blocked{}); +} + +// CHECK-GLOBAL-LABEL: @_Z16test_load_nativeRN4sycl3_V19sub_groupEPU3AS1iRi( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS1Kj(ptr addrspace(1) noundef nonnull [[P]]) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_Z16test_load_nativeRN4sycl3_V19sub_groupEPU3AS3iRi( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[CALL4_I_I:%.*]] = tail call spir_func noundef i32 @_Z30__spirv_SubgroupBlockReadINTELIjET_PU3AS3Kj(ptr addrspace(3) noundef nonnull [[P]]) #[[ATTR3]] +// CHECK-LOCAL-NEXT: store i32 [[CALL4_I_I]], ptr addrspace(4) [[OUT:%.*]], align 4 +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_load_native(sycl::sub_group &sg, plain_ptr p, + int &out) { + group_load(sg, p, out, opt_blocked_native{}); +} + +// CHECK-GLOBAL-LABEL: @_Z10test_storeRN4sycl3_V19sub_groupEiPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = 
ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-GLOBAL: if.then.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA10:![0-9]+]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_Z10test_storeRN4sycl3_V19sub_groupEiPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-LOCAL: if.then.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[V:%.*]], ptr 
addrspace(3) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] +// CHECK-LOCAL: if.end.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I18_I_I:%.*]] = sext i32 [[TMP2]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I19_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i64 [[IDXPROM_I18_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[V]], ptr addrspace(3) [[ARRAYIDX_I19_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_store(sycl::sub_group &sg, int v, plain_ptr p) { + group_store(sg, v, p, opt_blocked{}); +} + +// CHECK-GLOBAL-LABEL: @_Z17test_store_nativeRN4sycl3_V19sub_groupEiPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-GLOBAL: if.then.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA10]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_RKSP_SQ_SS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_Z17test_store_nativeRN4sycl3_V19sub_groupEiPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr 
addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-LOCAL: if.then.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[V:%.*]], ptr addrspace(3) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR3]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR3]] +// CHECK-LOCAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_RKSP_SQ_SS_.exit: +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_store_native(sycl::sub_group &sg, int v, + plain_ptr p) { + group_store(sg, v, p, opt_blocked_native{}); +} diff --git a/sycl/test/check_device_code/group_store.cpp b/sycl/test/check_device_code/group_store.cpp index 099353c36a2af..7e61b8de4e517 100644 --- a/sycl/test/check_device_code/group_store.cpp +++ b/sycl/test/check_device_code/group_store.cpp @@ -1,5 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clangxx -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck %s +// RUN: %clangxx -DGLOBAL_SPACE -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck --check-prefix CHECK-GLOBAL %s +// RUN: %clangxx -DLOCAL_SPACE -O3 -fsycl -fsycl-device-only -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -o - %s | FileCheck --check-prefix CHECK-LOCAL %s // Windows/linux have some 
slight differences in IR generation (function // arguments passing and long/long long differences/mangling) that could @@ -15,96 +16,161 @@ namespace oneapi_exp = sycl::ext::oneapi::experimental; using namespace sycl::ext::oneapi::experimental; using full_group_blocked = - decltype(properties(full_group, data_placement_blocked)); + decltype(properties(full_group, data_placement_blocked, + oneapi_exp::detail::native_local_block_io)); using naive_blocked = - decltype(properties(oneapi_exp::detail::naive, data_placement_blocked)); + decltype(properties(oneapi_exp::detail::naive, data_placement_blocked, + oneapi_exp::detail::native_local_block_io)); using opt_blocked = - decltype(properties(full_group, contiguous_memory, data_placement_blocked)); + decltype(properties(full_group, contiguous_memory, data_placement_blocked, + oneapi_exp::detail::native_local_block_io)); using full_group_striped = - decltype(properties(full_group, data_placement_striped)); + decltype(properties(full_group, data_placement_striped, + oneapi_exp::detail::native_local_block_io)); using naive_striped = - decltype(properties(oneapi_exp::detail::naive, data_placement_striped)); + decltype(properties(oneapi_exp::detail::naive, data_placement_striped, + oneapi_exp::detail::native_local_block_io)); using opt_striped = - decltype(properties(full_group, contiguous_memory, data_placement_striped)); + decltype(properties(full_group, contiguous_memory, data_placement_striped, + oneapi_exp::detail::native_local_block_io)); +#ifdef GLOBAL_SPACE template -using plain_global_ptr = typename sycl::detail::DecoratedType< +using plain_ptr = typename sycl::detail::DecoratedType< T, access::address_space::global_space>::type *; +#else +template +using plain_ptr = typename sycl::detail::DecoratedType< + T, access::address_space::local_space>::type *; +#endif namespace blocked { -// CHECK-LABEL: @_ZN7blocked10test_naiveERN4sycl3_V19sub_groupEiPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5:[0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8:![0-9]+]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void -// -SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, int v, - plain_global_ptr p) { +// CHECK-GLOBAL-LABEL: @_ZN7blocked10test_naiveERN4sycl3_V19sub_groupEiPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7:[0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8:![0-9]+]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked10test_naiveERN4sycl3_V19sub_groupEiPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7:[0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 
[[TMP0]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[V:%.*]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8:![0-9]+]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void +// +SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, int v, plain_ptr p) { // Ensure `detail::naive` always results in no block loads/stores. group_store(sg, v, p, naive_blocked{}); } -// CHECK-LABEL: @_ZN7blocked14test_optimizedERN4sycl3_V19sub_groupEiPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] -// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] -// CHECK: if.end.i.i: -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked14test_optimizedERN4sycl3_V19sub_groupEiPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 
[[REM_I_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-GLOBAL: if.then.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_RKSP_SQ_SS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked14test_optimizedERN4sycl3_V19sub_groupEiPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-LOCAL: if.then.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr 
addrspace(3) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[V:%.*]], ptr addrspace(3) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESR_RKSP_SQ_SS__EXIT]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESR_RKSP_SQ_SS_.exit: +// CHECK-LOCAL-NEXT: ret void // 
SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, int v, - plain_global_ptr p) { + plain_ptr p) { // Check that optimized implementation is selected. group_store(sg, v, p, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked27test_contiguous_auto_detectERN4sycl3_V19sub_groupEiPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] -// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(1) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESN_RKSL_SM_SO__EXIT:%.*]] -// CHECK: if.end.i.i: -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef 
[[V]]) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESN_RKSL_SM_SO__EXIT]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESN_RKSL_SM_SO_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked27test_contiguous_auto_detectERN4sycl3_V19sub_groupEiPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-GLOBAL: if.then.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[V:%.*]], ptr addrspace(1) 
[[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked27test_contiguous_auto_detectERN4sycl3_V19sub_groupEiPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], 
null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP0]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I_I]], label [[IF_END_I_I:%.*]], label [[IF_THEN_I_I:%.*]] +// CHECK-LOCAL: if.then.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[V:%.*]], ptr addrspace(3) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT:%.*]] +// CHECK-LOCAL: if.end.i.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], i32 noundef [[V]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEIPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESP_RKSN_SO_SQ__EXIT]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESP_RKSN_SO_SQ_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, int v, - plain_global_ptr p) { + plain_ptr p) { // Check that contiguous_memory can be auto-detected. 
group_store(sg, v, p, full_group_blocked{}); } @@ -113,20 +179,33 @@ SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, int v, using accessor_iter_t = accessor::iterator; +// CHECK-GLOBAL-LABEL: @_ZN7blocked18test_accessor_iterERN4sycl3_V19sub_groupEiRNS1_6detail17accessor_iteratorIiLi1EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18:![0-9]+]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-GLOBAL-NEXT: [[CONV5_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-GLOBAL-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV5_I_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[V:%.*]], ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void // -// CHECK-LABEL: @_ZN7blocked18test_accessor_iterERN4sycl3_V19sub_groupEiRNS1_6detail17accessor_iteratorIiLi1EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15:![0-9]+]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw 
i8, ptr addrspace(4) [[ITER]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18:![0-9]+]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[CONV5_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV5_I_I_I]] -// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-LOCAL-LABEL: @_ZN7blocked18test_accessor_iterERN4sycl3_V19sub_groupEiRNS1_6detail17accessor_iteratorIiLi1EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15:![0-9]+]] +// CHECK-LOCAL-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-LOCAL-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18:![0-9]+]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 +// CHECK-LOCAL-NEXT: [[CONV5_I_I_I:%.*]] = sext i32 [[TMP0]] to i64 +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], 
i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] +// CHECK-LOCAL-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP1]], i64 [[CONV5_I_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[V:%.*]], ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, int v, accessor_iter_t &iter) { @@ -134,35 +213,49 @@ SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, int v, group_store(sg, v, iter, full_group_blocked{}); } -// CHECK-LABEL: @_ZN7blocked34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupEiRNS1_6detail17accessor_iteratorIiLi1EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 -// CHECK-NEXT: [[AGG_TMP1_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP1_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP1_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP1_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[CMP_I_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I_I]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I_I]]) -// CHECK-NEXT: [[TMP0:%.*]] = ptrtoint ptr addrspace(4) [[ADD_PTR_I_I_I_I]] to i64 -// CHECK-NEXT: [[REM_I_I_I_I:%.*]] = and i64 [[TMP0]], 15 -// CHECK-NEXT: [[CMP1_I_I_I_I:%.*]] = icmp eq i64 [[REM_I_I_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_I_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL6DETAIL16GET_BLOCK_OP_PTRILI16ELM1ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS4_20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEEDAT1_T2__EXIT_I_I:%.*]], label [[IF_THEN_I_I:%.*]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental6detail16get_block_op_ptrILi16ELm1ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS4_20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEEDaT1_T2_.exit.i.i: -// CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I_I]], i32 noundef 5) #[[ATTR6:[0-9]+]] -// CHECK-NEXT: [[TOBOOL_NOT_I_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I_I]], null -// CHECK-NEXT: br i1 [[TOBOOL_NOT_I_I]], label [[IF_THEN_I_I]], label [[IF_END_I_I:%.*]] -// CHECK: if.then.i.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4 -// CHECK-NEXT: [[CONV5_I_I_I:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I_I]], i64 [[CONV5_I_I_I]] -// CHECK-NEXT: store i32 [[V:%.*]], ptr addrspace(4) [[ADD_PTR_I_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEINS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESQ_RKSO_SP_SR__EXIT:%.*]] -// CHECK: if.end.i.i: -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIjEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I_I]], i32 noundef [[V]]) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEINS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T1_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT2_EEVE4TYPEESQ_RKSO_SP_SR__EXIT]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiNS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T1_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT2_EEvE4typeESQ_RKSO_SP_SR_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupEiRNS1_6detail17accessor_iteratorIiLi1EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[AGG_TMP_I:%.*]] = alloca %"struct.sycl::_V1::sub_group", align 1 +// CHECK-GLOBAL-NEXT: [[AGG_TMP1_I:%.*]] = alloca %"class.sycl::_V1::span", align 8 +// CHECK-GLOBAL-NEXT: 
[[AGG_TMP3_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::experimental::properties.7", align 1 +// CHECK-GLOBAL-NEXT: [[AGG_TMP14:%.*]] = alloca %"class.sycl::_V1::detail::accessor_iterator", align 8 +// CHECK-GLOBAL-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-GLOBAL-NEXT: [[V_ADDR_ASCAST:%.*]] = addrspacecast ptr [[V_ADDR]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: store i32 [[V:%.*]], ptr [[V_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 80, ptr nonnull [[AGG_TMP14]]) +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr nonnull [[AGG_TMP_I]]) +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP1_I]]) +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP3_I]]) +// CHECK-GLOBAL-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr noundef nonnull align 8 dereferenceable(80) [[AGG_TMP14]], ptr addrspace(4) noundef align 8 dereferenceable(80) [[ITER:%.*]], i64 80, i1 false) +// CHECK-GLOBAL-NEXT: store ptr addrspace(4) [[V_ADDR_ASCAST]], ptr [[AGG_TMP1_I]], align 8, !tbaa [[TBAA21:![0-9]+]] +// CHECK-GLOBAL-NEXT: call spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKiLm1ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEENSD_INSB_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_(ptr noundef nonnull byval(%"struct.sycl::_V1::sub_group") align 1 [[AGG_TMP_I]], ptr noundef nonnull byval(%"class.sycl::_V1::span") align 8 [[AGG_TMP1_I]], ptr noundef nonnull byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[AGG_TMP14]], ptr noundef nonnull 
byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.7") align 1 [[AGG_TMP3_I]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 80, ptr nonnull [[AGG_TMP14]]) +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr nonnull [[AGG_TMP_I]]) +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP1_I]]) +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP3_I]]) +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupEiRNS1_6detail17accessor_iteratorIiLi1EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[AGG_TMP_I:%.*]] = alloca %"struct.sycl::_V1::sub_group", align 1 +// CHECK-LOCAL-NEXT: [[AGG_TMP1_I:%.*]] = alloca %"class.sycl::_V1::span", align 8 +// CHECK-LOCAL-NEXT: [[AGG_TMP3_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::experimental::properties.7", align 1 +// CHECK-LOCAL-NEXT: [[AGG_TMP14:%.*]] = alloca %"class.sycl::_V1::detail::accessor_iterator", align 8 +// CHECK-LOCAL-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-LOCAL-NEXT: [[V_ADDR_ASCAST:%.*]] = addrspacecast ptr [[V_ADDR]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: store i32 [[V:%.*]], ptr [[V_ADDR]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 80, ptr nonnull [[AGG_TMP14]]) +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr nonnull [[AGG_TMP_I]]) +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP1_I]]) +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP3_I]]) +// CHECK-LOCAL-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr noundef nonnull align 8 dereferenceable(80) [[AGG_TMP14]], ptr addrspace(4) noundef align 8 dereferenceable(80) [[ITER:%.*]], i64 80, i1 false) +// CHECK-LOCAL-NEXT: store ptr addrspace(4) [[V_ADDR_ASCAST]], ptr [[AGG_TMP1_I]], align 8, !tbaa [[TBAA21:![0-9]+]] +// 
CHECK-LOCAL-NEXT: call spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKiLm1ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSD_INS3_21contiguous_memory_keyEJEEENSD_INS3_14full_group_keyEJEEENSD_INSB_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_(ptr noundef nonnull byval(%"struct.sycl::_V1::sub_group") align 1 [[AGG_TMP_I]], ptr noundef nonnull byval(%"class.sycl::_V1::span") align 8 [[AGG_TMP1_I]], ptr noundef nonnull byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[AGG_TMP14]], ptr noundef nonnull byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.7") align 1 [[AGG_TMP3_I]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 80, ptr nonnull [[AGG_TMP14]]) +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr nonnull [[AGG_TMP_I]]) +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP1_I]]) +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP3_I]]) +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, int v, @@ -171,390 +264,718 @@ SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, group_store(sg, v, iter, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked16test_four_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm4EEEPU3AS1s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA22:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void 
@llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META24:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27:![0-9]+]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, 
ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP29:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7:[0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] -// CHECK: for.cond.cleanup.i: -// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I]], align 2, !tbaa [[TBAA31:![0-9]+]] -// CHECK-NEXT: tail call spir_func void 
@_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] -// CHECK-NEXT: store i16 [[TMP6]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP32:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked16test_four_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm4EEEPU3AS1s( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// 
CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA25:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META27:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_NS0_4SPANISR_XT1_EEESS_SU__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// 
CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA30:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP32:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// 
CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9:[0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-GLOBAL: for.cond.i: +// CHECK-GLOBAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-GLOBAL: for.cond.cleanup.i: +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I]], align 2, !tbaa [[TBAA34:![0-9]+]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] +// CHECK-GLOBAL: for.body.i: +// CHECK-GLOBAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP6]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// 
CHECK-GLOBAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP35:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked16test_four_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm4EEEPU3AS3s( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA25:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META27:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], 
[[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_NS0_4SPANISR_XT1_EEESS_SU__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA30:![0-9]+]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP4]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP32:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS3sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9:[0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-LOCAL: for.cond.i: +// CHECK-LOCAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-LOCAL: for.cond.cleanup.i: +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I]], align 2, !tbaa [[TBAA34:![0-9]+]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS3mT_(ptr addrspace(3) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: call void 
@llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] +// CHECK-LOCAL: for.body.i: +// CHECK-LOCAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP6]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP35:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS3sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Four shorts in blocked data layout could be stored as a single 64-bit // integer. 
group_store(sg, v, p, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked22test_four_const_shortsERN4sycl3_V19sub_groupENS1_4spanIKsLm4EEEPU3AS1s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA22]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META34:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_ST__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: 
[[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP37:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_ST_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_SR__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label 
[[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] -// CHECK: for.cond.cleanup.i: -// CHECK-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I]], align 2, !tbaa [[TBAA31]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_SR__EXIT]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] -// CHECK-NEXT: store i16 [[TMP6]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP38:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_SR_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked22test_four_const_shortsERN4sycl3_V19sub_groupENS1_4spanIKsLm4EEEPU3AS1s( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA25]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META37:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 
[[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESU_NS0_4SPANISS_XT1_EEEST_SV__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP40:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEENSC_INSA_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESU_NS0_4spanISS_XT1_EEEST_SV_.exit.i: +// CHECK-GLOBAL-NEXT: tail call 
spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_ST__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-GLOBAL: for.cond.i: +// CHECK-GLOBAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-GLOBAL: for.cond.cleanup.i: +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I]], align 2, !tbaa [[TBAA34]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS1mT_(ptr addrspace(1) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_ST__EXIT]] +// CHECK-GLOBAL: for.body.i: +// CHECK-GLOBAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP6]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP41:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_ST_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked22test_four_const_shortsERN4sycl3_V19sub_groupENS1_4spanIKsLm4EEEPU3AS3s( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa 
[[TBAA25]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META37:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP3]], 2 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESU_NS0_4SPANISS_XT1_EEEST_SV__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) 
[[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP4]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP40:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS3sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEENSC_INSA_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESU_NS0_4spanISS_XT1_EEEST_SV_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_ST__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: 
br label [[FOR_COND_I:%.*]] +// CHECK-LOCAL: for.cond.i: +// CHECK-LOCAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-LOCAL: for.cond.cleanup.i: +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i64, ptr [[VALUES_I]], align 2, !tbaa [[TBAA34]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELImEvPU3AS3mT_(ptr addrspace(3) noundef nonnull [[P]], i64 noundef [[TMP5]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEKSLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_ST__EXIT]] +// CHECK-LOCAL: for.body.i: +// CHECK-LOCAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP6]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP41:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEKsLm4EPU3AS3sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_ST_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_four_const_shorts(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Same, but make it `const short`. group_store(sg, v, p, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META40:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 3 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP43:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// 
CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META43:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 3 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// 
CHECK-GLOBAL-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP46:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META43:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 3 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP3]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP46:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// 
CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Check for non-power-of-two size. group_store(sg, v, p, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked14test_four_intsERN4sycl3_V19sub_groupENS1_4spanIiLm4EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META45:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP2]], 2 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM4EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] -// 
CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP48:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm4EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked14test_four_intsERN4sycl3_V19sub_groupENS1_4spanIiLm4EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META48:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP2]], 2 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], 
label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM4EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP51:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm4EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 
noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked14test_four_intsERN4sycl3_V19sub_groupENS1_4spanIiLm4EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META48:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = shl i32 [[TMP2]], 2 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM4EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = or disjoint i32 [[MUL_I_I_I]], [[I_0_I_I]] +// 
CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP3]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP51:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm4EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_four_ints(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Four int elements in blocked data layout don't map directly to any // BlockWrite API. 
group_store(sg, v, p, opt_blocked{}); } -// CHECK-LABEL: @_ZN7blocked15test_seven_intsERN4sycl3_V19sub_groupENS1_4spanIiLm7EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META50:![0-9]+]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 7 -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 7 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM7EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: 
store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP53:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm7EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7blocked15test_seven_intsERN4sycl3_V19sub_groupENS1_4spanIiLm7EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META53:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 7 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM7EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP56:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm7EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] 
+// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7blocked15test_seven_intsERN4sycl3_V19sub_groupENS1_4spanIiLm7EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META53:![0-9]+]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP2]], 7 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 7 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM7EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI0EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[MUL_I_I_I]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 
[[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP3]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP56:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm7EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi0EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_seven_ints(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Similar to four elements case but more complex to optimize. group_store(sg, v, p, opt_blocked{}); } } // namespace blocked namespace striped { -// Striped data layout with one element per work item isn't different from -// blocked data layout, so use span version only in the checks below. 
- -// CHECK-LABEL: @_ZN7striped10test_naiveERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META55:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META58:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESN_NS0_4SPANISL_XT1_EEESM_SO__EXIT:%.*]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] -// CHECK-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] -// CHECK-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 
[[IDXPROM_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP61:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESN_NS0_4spanISL_XT1_EEESM_SO_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped10test_naiveERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META58:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META61:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-GLOBAL: for.cond.i: +// CHECK-GLOBAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i: +// CHECK-GLOBAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP64:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// 
CHECK-LOCAL-LABEL: @_ZN7striped10test_naiveERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META58:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META61:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-LOCAL: for.cond.i: +// CHECK-LOCAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] +// CHECK-LOCAL: for.body.i: +// CHECK-LOCAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[MUL_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I]] +// CHECK-LOCAL-NEXT: 
[[IDXPROM_I:%.*]] = sext i32 [[ADD_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP4]], ptr addrspace(3) [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP64:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_naive(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Ensure `detail::naive` always results in no block loads/stores. 
group_store(sg, v, p, naive_striped{}); } -// CHECK-LABEL: @_ZN7striped14test_optimizedERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META63:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META66:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP69:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] -// CHECK: for.cond.cleanup.i: -// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA31]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr 
addrspace(4) [[ARRAYIDX_I20_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP70:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped14test_optimizedERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias 
[[META66:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META69:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_NS0_4SPANISR_XT1_EEESS_SU__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP72:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-GLOBAL: for.cond.i: +// CHECK-GLOBAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-GLOBAL: for.cond.cleanup.i: +// CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA34]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: call 
void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] +// CHECK-GLOBAL: for.body.i: +// CHECK-GLOBAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP73:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped14test_optimizedERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], 
align 4 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META66:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META69:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_NS0_4SPANISR_XT1_EEESS_SU__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// 
CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP5]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP72:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-LOCAL: for.cond.i: +// CHECK-LOCAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-LOCAL: for.cond.cleanup.i: +// CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA34]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] +// CHECK-LOCAL: for.body.i: +// CHECK-LOCAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 
+// CHECK-LOCAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP73:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_optimized(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Check that optimized implementation is selected. 
group_store(sg, v, p, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped27test_contiguous_auto_detectERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META72:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META75:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP78:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESN_NS0_4SPANISL_XT1_EEESM_SO__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] -// CHECK: for.cond.cleanup.i: -// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA31]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESN_NS0_4SPANISL_XT1_EEESM_SO__EXIT]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: 
[[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP79:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESN_NS0_4spanISL_XT1_EEESM_SO_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped27test_contiguous_auto_detectERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META75:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) 
@__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META78:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP81:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-GLOBAL: for.cond.i: +// CHECK-GLOBAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-GLOBAL: for.cond.cleanup.i: +// CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA34]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] 
+// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] +// CHECK-GLOBAL: for.body.i: +// CHECK-GLOBAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP82:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped27test_contiguous_auto_detectERN4sycl3_V19sub_groupENS1_4spanIiLm2EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = 
inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META75:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META78:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) 
[[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP5]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP81:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label 
[[FOR_COND_I:%.*]] +// CHECK-LOCAL: for.cond.i: +// CHECK-LOCAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-LOCAL: for.cond.cleanup.i: +// CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA34]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS3jT_(ptr addrspace(3) noundef nonnull [[P]], <2 x i32> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] +// CHECK-LOCAL: for.body.i: +// CHECK-LOCAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP82:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Check that contiguous_memory can be auto-detected. group_store(sg, v, p, full_group_striped{}); } @@ -563,36 +984,67 @@ SYCL_EXTERNAL void test_contiguous_auto_detect(sycl::sub_group &sg, using accessor_iter_t = accessor::iterator; -// CHECK-LABEL: @_ZN7striped18test_accessor_iterERN4sycl3_V19sub_groupENS1_4spanIiLm2EEERNS1_6detail17accessor_iteratorIiLi1EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 -// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META81:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META84:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = 
getelementptr i32, ptr addrspace(4) [[AGG_TMP2_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP2_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESO_NS0_4SPANISM_XT1_EEESN_SP__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[CONV5_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV5_I_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP87:![0-9]+]] -// CHECK: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESO_NS0_4spanISM_XT1_EEESN_SP_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped18test_accessor_iterERN4sycl3_V19sub_groupENS1_4spanIiLm2EEERNS1_6detail17accessor_iteratorIiLi1EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-GLOBAL-NEXT: [[AGG_TMP2_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META84:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META87:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP2_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP2_SROA_2_0_COPYLOAD]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// 
CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_SR__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[CONV5_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV5_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP90:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEENSC_INSA_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_SR_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped18test_accessor_iterERN4sycl3_V19sub_groupENS1_4spanIiLm2EEERNS1_6detail17accessor_iteratorIiLi1EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 +// CHECK-LOCAL-NEXT: [[AGG_TMP2_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META84:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META87:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr addrspace(4) [[AGG_TMP2_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP2_SROA_2_0_COPYLOAD]] +// 
CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_SR__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP3]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[CONV5_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[TMP4]], i64 [[CONV5_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP90:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_14full_group_keyEJEEENSC_INSA_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_SR_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, span v, accessor_iter_t &iter) { @@ -600,70 +1052,29 @@ SYCL_EXTERNAL void test_accessor_iter(sycl::sub_group &sg, span v, group_store(sg, v, iter, full_group_striped{}); } -// CHECK-LABEL: @_ZN7striped34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupENS1_4spanIiLm2EEERNS1_6detail17accessor_iteratorIiLi1EEE( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [2 x i32], align 4 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ITER:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[ITER]], i64 8 -// CHECK-NEXT: [[AGG_TMP2_SROA_2_0_COPYLOAD:%.*]] = load i64, ptr addrspace(4) [[AGG_TMP2_SROA_2_0_ITER_SROA_IDX]], align 8, !tbaa [[TBAA18]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[ADD_PTR_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[AGG_TMP2_SROA_0_0_COPYLOAD]], i64 [[AGG_TMP2_SROA_2_0_COPYLOAD]] -// CHECK-NEXT: [[CMP_I_I_I:%.*]] = icmp ne ptr addrspace(4) [[ADD_PTR_I_I_I]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = 
ptrtoint ptr addrspace(4) [[ADD_PTR_I_I_I]] to i64 -// CHECK-NEXT: [[REM_I_I_I:%.*]] = and i64 [[TMP2]], 15 -// CHECK-NEXT: [[CMP1_I_I_I:%.*]] = icmp eq i64 [[REM_I_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL6DETAIL16GET_BLOCK_OP_PTRILI16ELM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS4_20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEEDAT1_T2__EXIT_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental6detail16get_block_op_ptrILi16ELm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS4_20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEEDaT1_T2_.exit.i: -// CHECK-NEXT: [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef ptr addrspace(1) @_Z41__spirv_GenericCastToPtrExplicit_ToGlobalPvi(ptr addrspace(4) noundef nonnull [[ADD_PTR_I_I_I]], i32 noundef 5) #[[ATTR6]] -// CHECK-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(1) [[CALL_I_I_I_I]], null -// CHECK-NEXT: br i1 [[TOBOOL_NOT_I]], label [[IF_THEN_I]], label [[IF_END_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META89:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META92:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 2 -// 
CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEENSC_INSA_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESS_NS0_4SPANISQ_XT1_EEESR_ST__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul nuw nsw i32 [[TMP4]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] -// CHECK-NEXT: [[CONV5_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ADD_PTR_I_I_I_I:%.*]] = getelementptr i32, ptr addrspace(4) [[ADD_PTR_I_I_I]], i64 [[CONV5_I_I]] -// CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(4) [[ADD_PTR_I_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP95:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_ST_.exit.i: -// CHECK-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_SR__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 2 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] -// CHECK: for.cond.cleanup.i: -// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i32>, ptr [[VALUES_I]], align 4, !tbaa [[TBAA31]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv2_jEvPU3AS1jT_(ptr addrspace(1) noundef nonnull [[CALL_I_I_I_I]], <2 x i32> noundef [[TMP6]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM2ENS0_6DETAIL17ACCESSOR_ITERATORIILI1EEENS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSC_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSC_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESQ_NS0_4SPANISO_XT1_EEESP_SR__EXIT]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg 
i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [2 x i32], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] -// CHECK-NEXT: store i32 [[TMP7]], ptr [[ARRAYIDX_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP96:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESQ_NS0_4spanISO_XT1_EEESP_SR_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupENS1_4spanIiLm2EEERNS1_6detail17accessor_iteratorIiLi1EEE( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[AGG_TMP:%.*]] = alloca %"struct.sycl::_V1::sub_group", align 1 +// CHECK-GLOBAL-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::span.22", align 8 +// CHECK-GLOBAL-NEXT: [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::detail::accessor_iterator", align 8 +// CHECK-GLOBAL-NEXT: [[AGG_TMP3:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::experimental::properties.28", align 1 +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: store i64 [[TMP0]], ptr [[AGG_TMP1]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[AGG_TMP2]], ptr addrspace(4) align 8 [[ITER:%.*]], i64 80, i1 false), !tbaa.struct [[TBAA_STRUCT92:![0-9]+]] 
+// CHECK-GLOBAL-NEXT: tail call spir_func void @_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_ST_(ptr noundef nonnull byval(%"struct.sycl::_V1::sub_group") align 1 [[AGG_TMP]], ptr noundef nonnull byval(%"class.sycl::_V1::span.22") align 8 [[AGG_TMP1]], ptr noundef nonnull byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[AGG_TMP2]], ptr noundef nonnull byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.28") align 1 [[AGG_TMP3]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped34test_accessor_iter_force_optimizedERN4sycl3_V19sub_groupENS1_4spanIiLm2EEERNS1_6detail17accessor_iteratorIiLi1EEE( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[AGG_TMP:%.*]] = alloca %"struct.sycl::_V1::sub_group", align 1 +// CHECK-LOCAL-NEXT: [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::span.22", align 8 +// CHECK-LOCAL-NEXT: [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::detail::accessor_iterator", align 8 +// CHECK-LOCAL-NEXT: [[AGG_TMP3:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::experimental::properties.28", align 1 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: store i64 [[TMP0]], ptr [[AGG_TMP1]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: call void @llvm.memcpy.p0.p4.i64(ptr align 8 [[AGG_TMP2]], ptr addrspace(4) align 8 [[ITER:%.*]], i64 80, i1 false), !tbaa.struct [[TBAA_STRUCT92:![0-9]+]] +// CHECK-LOCAL-NEXT: tail call spir_func void 
@_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm2ENS0_6detail17accessor_iteratorIiLi1EEENS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSC_INS3_21contiguous_memory_keyEJEEENSC_INS3_14full_group_keyEJEEENSC_INSA_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESS_NS0_4spanISQ_XT1_EEESR_ST_(ptr noundef nonnull byval(%"struct.sycl::_V1::sub_group") align 1 [[AGG_TMP]], ptr noundef nonnull byval(%"class.sycl::_V1::span.22") align 8 [[AGG_TMP1]], ptr noundef nonnull byval(%"class.sycl::_V1::detail::accessor_iterator") align 8 [[AGG_TMP2]], ptr noundef nonnull byval(%"class.sycl::_V1::ext::oneapi::experimental::properties.28") align 1 [[AGG_TMP3]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, span v, @@ -672,228 +1083,423 @@ SYCL_EXTERNAL void test_accessor_iter_force_optimized(sycl::sub_group &sg, group_store(sg, v, iter, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped16test_four_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm4EEEPU3AS1s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA22]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, 
i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META98:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META101:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP104:![0-9]+]] -// 
CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] -// CHECK: for.cond.cleanup.i: -// CHECK-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[VALUES_I]], align 2, !tbaa [[TBAA31]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv4_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[P]], <4 x i16> noundef [[TMP6]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] -// CHECK-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP105:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped16test_four_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm4EEEPU3AS1s( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA25]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], 
null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META112:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META115:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_NS0_4SPANISR_XT1_EEESS_SU__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = 
load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP118:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// 
CHECK-GLOBAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-GLOBAL: for.cond.i: +// CHECK-GLOBAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-GLOBAL: for.cond.cleanup.i: +// CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[VALUES_I]], align 2, !tbaa [[TBAA34]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv4_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[P]], <4 x i16> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] +// CHECK-GLOBAL: for.body.i: +// CHECK-GLOBAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP119:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped16test_four_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm4EEEPU3AS3s( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[VALUES_I:%.*]] = alloca [4 x i16], align 2 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA25]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META112:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META115:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], 
[[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_NS0_4SPANISR_XT1_EEESS_SU__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP5]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP118:![0-9]+]] +// CHECK-LOCAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS3sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_.exit.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-LOCAL: for.cond.i: +// CHECK-LOCAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 4 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-LOCAL: for.cond.cleanup.i: +// CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load <4 x i16>, ptr [[VALUES_I]], align 2, !tbaa [[TBAA34]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv4_tEvPU3AS3tT_(ptr addrspace(3) noundef nonnull [[P]], <4 x i16> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: call void 
@llvm.lifetime.end.p0(i64 8, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM4EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] +// CHECK-LOCAL: for.body.i: +// CHECK-LOCAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [4 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP119:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm4EPU3AS3sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_four_shorts(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Just because there is a blocked data layout testcase, nothing inherently // useful here. 
group_store(sg, v, p, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped19test_sixteen_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm16EEEPU3AS1s( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[VALUES_I:%.*]] = alloca [16 x i16], align 2 -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA22]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null -// CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) -// CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 -// CHECK-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 -// CHECK-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 -// CHECK-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] -// CHECK: if.then.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META107:![0-9]+]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META110:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 -// CHECK-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT_I:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP113:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm16EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit.i: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: if.end.i: -// CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[FOR_COND_I:%.*]] -// CHECK: for.cond.i: -// CHECK-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] -// CHECK-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 16 -// CHECK-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] -// CHECK: for.cond.cleanup.i: -// CHECK-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr [[VALUES_I]], align 2, !tbaa [[TBAA31]] -// CHECK-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv16_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[P]], <16 x i16> noundef [[TMP6]]) #[[ATTR5]] -// CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VALUES_I]]) #[[ATTR7]] -// CHECK-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT]] -// CHECK: for.body.i: -// CHECK-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] -// CHECK-NEXT: [[TMP7:%.*]] = load i16, ptr 
addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [16 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] -// CHECK-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA27]] -// CHECK-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP114:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm16EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped19test_sixteen_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm16EEEPU3AS1s( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[VALUES_I:%.*]] = alloca [16 x i16], align 2 +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA25]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(1) [[P:%.*]], null +// CHECK-GLOBAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(1) [[P]] to i64 +// CHECK-GLOBAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-GLOBAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-GLOBAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-GLOBAL: if.then.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], 
!noalias [[META121:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META124:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_NS0_4SPANISR_XT1_EEESS_SU__EXIT_I:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(1) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP5]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP127:![0-9]+]] +// 
CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm16EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_.exit.i: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: if.end.i: +// CHECK-GLOBAL-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-GLOBAL: for.cond.i: +// CHECK-GLOBAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 16 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-GLOBAL: for.cond.cleanup.i: +// CHECK-GLOBAL-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr [[VALUES_I]], align 2, !tbaa [[TBAA34]] +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv16_tEvPU3AS1tT_(ptr addrspace(1) noundef nonnull [[P]], <16 x i16> noundef [[TMP6]]) #[[ATTR7]] +// 
CHECK-GLOBAL-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-GLOBAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS1SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] +// CHECK-GLOBAL: for.body.i: +// CHECK-GLOBAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [16 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-GLOBAL-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA30]] +// CHECK-GLOBAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP128:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm16EPU3AS1sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped19test_sixteen_shortsERN4sycl3_V19sub_groupENS1_4spanIsLm16EEEPU3AS3s( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: 
[[VALUES_I:%.*]] = alloca [16 x i16], align 2 +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA25]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp ne ptr addrspace(3) [[P:%.*]], null +// CHECK-LOCAL-NEXT: tail call void @llvm.assume(i1 [[CMP_I_I]]) +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = ptrtoint ptr addrspace(3) [[P]] to i64 +// CHECK-LOCAL-NEXT: [[REM_I_I:%.*]] = and i64 [[TMP2]], 15 +// CHECK-LOCAL-NEXT: [[CMP1_I_NOT_I:%.*]] = icmp eq i64 [[REM_I_I]], 0 +// CHECK-LOCAL-NEXT: br i1 [[CMP1_I_NOT_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]] +// CHECK-LOCAL: if.then.i: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META121:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META124:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[IF_THEN_I]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I19_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I19_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_9NAIVE_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEEST_NS0_4SPANISR_XT1_EEESS_SU__EXIT_I:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP5:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP4]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP3]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(3) [[P]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP5]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP127:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm16EPU3AS3sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_9naive_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeEST_NS0_4spanISR_XT1_EEESS_SU_.exit.i: +// CHECK-LOCAL-NEXT: tail 
call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: if.end.i: +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I:%.*]] +// CHECK-LOCAL: for.cond.i: +// CHECK-LOCAL-NEXT: [[I_0_I:%.*]] = phi i32 [ 0, [[IF_END_I]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I:%.*]] = icmp samesign ult i32 [[I_0_I]], 16 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[FOR_COND_CLEANUP_I:%.*]] +// CHECK-LOCAL: for.cond.cleanup.i: +// CHECK-LOCAL-NEXT: [[TMP6:%.*]] = load <16 x i16>, ptr [[VALUES_I]], align 2, !tbaa [[TBAA34]] +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z31__spirv_SubgroupBlockWriteINTELIDv16_tEvPU3AS3tT_(ptr addrspace(3) noundef nonnull [[P]], <16 x i16> noundef [[TMP6]]) #[[ATTR7]] +// CHECK-LOCAL-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[VALUES_I]]) #[[ATTR9]] +// CHECK-LOCAL-NEXT: br label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPESLM16EPU3AS3SNS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT]] +// CHECK-LOCAL: for.body.i: +// CHECK-LOCAL-NEXT: [[CONV_I:%.*]] = zext nneg i32 [[I_0_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I20_I:%.*]] = getelementptr inbounds nuw i16, ptr addrspace(4) [[TMP1]], i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: [[TMP7:%.*]] = load i16, ptr addrspace(4) [[ARRAYIDX_I20_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I:%.*]] = getelementptr inbounds [16 x i16], ptr [[VALUES_I]], i64 0, i64 [[CONV_I]] +// CHECK-LOCAL-NEXT: store i16 [[TMP7]], ptr [[ARRAYIDX_I]], align 2, !tbaa [[TBAA30]] +// CHECK-LOCAL-NEXT: [[INC_I]] = add nuw nsw i32 [[I_0_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I]], !llvm.loop [[LOOP128:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEsLm16EPU3AS3sNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_sixteen_shorts(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { group_store(sg, v, p, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: 
[[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META107:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META110:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i32 [[TMP4]], 
ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP113:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META130:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META133:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP136:![0-9]+]] +// CHECK-GLOBAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped21test_non_power_of_twoERN4sycl3_V19sub_groupENS1_4spanIiLm3EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META130:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META133:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 3 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM3EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr 
addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP4]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP136:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm3EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_non_power_of_two(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Check for non-power-of-two size. 
group_store(sg, v, p, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped17test_sixteen_intsERN4sycl3_V19sub_groupENS1_4spanIiLm16EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META115:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META118:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM16EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext 
i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP121:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm16EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped17test_sixteen_intsERN4sycl3_V19sub_groupENS1_4spanIiLm16EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META138:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META141:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 
[[I_0_I_I]], 16 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM16EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP144:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm16EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped17test_sixteen_intsERN4sycl3_V19sub_groupENS1_4spanIiLm16EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META138:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META141:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 16 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM16EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP4]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP144:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm16EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_sixteen_ints(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Even though power of two, still too many to map directly onto BloadWrite // API. group_store(sg, v, p, opt_striped{}); } -// CHECK-LABEL: @_ZN7striped16test_eleven_intsERN4sycl3_V19sub_groupENS1_4spanIiLm11EEEPU3AS1i( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] -// CHECK-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META123:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META126:![0-9]+]] -// CHECK-NEXT: br label [[FOR_COND_I_I:%.*]] -// CHECK: for.cond.i.i: -// CHECK-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] -// CHECK-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 11 -// CHECK-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM11EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESP_NS0_4SPANISN_XT1_EEESO_SQ__EXIT:%.*]] -// CHECK: for.body.i.i: -// CHECK-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr 
addrspace(4) [[TMP1]], i64 [[CONV_I_I]] -// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] -// CHECK-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] -// CHECK-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 -// CHECK-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] -// CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] -// CHECK-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 -// CHECK-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP129:![0-9]+]] -// CHECK: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm11EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESP_NS0_4spanISN_XT1_EEESO_SQ_.exit: -// CHECK-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR5]] -// CHECK-NEXT: ret void +// CHECK-GLOBAL-LABEL: @_ZN7striped16test_eleven_intsERN4sycl3_V19sub_groupENS1_4spanIiLm11EEEPU3AS1i( +// CHECK-GLOBAL-NEXT: entry: +// CHECK-GLOBAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-GLOBAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META146:![0-9]+]] +// CHECK-GLOBAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, 
align 4, !tbaa [[TBAA8]], !noalias [[META149:![0-9]+]] +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-GLOBAL: for.cond.i.i: +// CHECK-GLOBAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-GLOBAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 11 +// CHECK-GLOBAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM11EPU3AS1INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-GLOBAL: for.body.i.i: +// CHECK-GLOBAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-GLOBAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-GLOBAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-GLOBAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-GLOBAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-GLOBAL-NEXT: store i32 [[TMP4]], ptr addrspace(1) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-GLOBAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-GLOBAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP152:![0-9]+]] +// CHECK-GLOBAL: 
_ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm11EPU3AS1iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-GLOBAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-GLOBAL-NEXT: ret void +// +// CHECK-LOCAL-LABEL: @_ZN7striped16test_eleven_intsERN4sycl3_V19sub_groupENS1_4spanIiLm11EEEPU3AS3i( +// CHECK-LOCAL-NEXT: entry: +// CHECK-LOCAL-NEXT: [[TMP0:%.*]] = load i64, ptr [[V:%.*]], align 8, !tbaa [[TBAA15]] +// CHECK-LOCAL-NEXT: [[TMP1:%.*]] = inttoptr i64 [[TMP0]] to ptr addrspace(4) +// CHECK-LOCAL-NEXT: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupLocalInvocationId, align 4, !tbaa [[TBAA8]], !noalias [[META146:![0-9]+]] +// CHECK-LOCAL-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) @__spirv_BuiltInSubgroupSize, align 4, !tbaa [[TBAA8]], !noalias [[META149:![0-9]+]] +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I:%.*]] +// CHECK-LOCAL: for.cond.i.i: +// CHECK-LOCAL-NEXT: [[I_0_I_I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ] +// CHECK-LOCAL-NEXT: [[CMP_I_I:%.*]] = icmp samesign ult i32 [[I_0_I_I]], 11 +// CHECK-LOCAL-NEXT: br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label 
[[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL11GROUP_STOREINS0_9SUB_GROUPEILM11EPU3AS3INS3_10PROPERTIESINS3_6DETAIL20PROPERTIES_TYPE_LISTIJNS3_14PROPERTY_VALUEINS3_18DATA_PLACEMENT_KEYEJST17INTEGRAL_CONSTANTIILI1EEEEENSB_INS3_21CONTIGUOUS_MEMORY_KEYEJEEENSB_INS3_14FULL_GROUP_KEYEJEEENSB_INS9_25NATIVE_LOCAL_BLOCK_IO_KEYEJEEEEEEEEEENST9ENABLE_IFIXAAAASR6DETAILE18VERIFY_STORE_TYPESIT0_T2_ESR6DETAILE18IS_GENERIC_GROUP_VIT_E18IS_PROPERTY_LIST_VIT3_EEVE4TYPEESR_NS0_4SPANISP_XT1_EEESQ_SS__EXIT:%.*]] +// CHECK-LOCAL: for.body.i.i: +// CHECK-LOCAL-NEXT: [[CONV_I_I:%.*]] = zext nneg i32 [[I_0_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I_I:%.*]] = getelementptr inbounds nuw i32, ptr addrspace(4) [[TMP1]], i64 [[CONV_I_I]] +// CHECK-LOCAL-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[ARRAYIDX_I_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[MUL_I_I_I:%.*]] = mul i32 [[TMP3]], [[I_0_I_I]] +// CHECK-LOCAL-NEXT: [[ADD_I_I_I:%.*]] = add i32 [[TMP2]], [[MUL_I_I_I]] +// CHECK-LOCAL-NEXT: [[IDXPROM_I_I:%.*]] = sext i32 [[ADD_I_I_I]] to i64 +// CHECK-LOCAL-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P:%.*]], i64 [[IDXPROM_I_I]] +// CHECK-LOCAL-NEXT: store i32 [[TMP4]], ptr addrspace(3) [[ARRAYIDX_I_I]], align 4, !tbaa [[TBAA8]] +// CHECK-LOCAL-NEXT: [[INC_I_I]] = add nuw nsw i32 [[I_0_I_I]], 1 +// CHECK-LOCAL-NEXT: br label [[FOR_COND_I_I]], !llvm.loop [[LOOP152:![0-9]+]] +// CHECK-LOCAL: _ZN4sycl3_V13ext6oneapi12experimental11group_storeINS0_9sub_groupEiLm11EPU3AS3iNS3_10propertiesINS3_6detail20properties_type_listIJNS3_14property_valueINS3_18data_placement_keyEJSt17integral_constantIiLi1EEEEENSB_INS3_21contiguous_memory_keyEJEEENSB_INS3_14full_group_keyEJEEENSB_INS9_25native_local_block_io_keyEJEEEEEEEEEENSt9enable_ifIXaaaasr6detailE18verify_store_typesIT0_T2_Esr6detailE18is_generic_group_vIT_E18is_property_list_vIT3_EEvE4typeESR_NS0_4spanISP_XT1_EEESQ_SS_.exit: +// CHECK-LOCAL-NEXT: tail call spir_func void 
@_Z22__spirv_ControlBarrierjjj(i32 noundef 3, i32 noundef 3, i32 noundef 912) #[[ATTR7]] +// CHECK-LOCAL-NEXT: ret void // SYCL_EXTERNAL void test_eleven_ints(sycl::sub_group &sg, span v, - plain_global_ptr p) { + plain_ptr p) { // Non-power of two case bigger than max natively supported power of two case. group_store(sg, v, p, opt_striped{}); } From 0dcc42edd07f1e0973b060849d25a450725db9c3 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 24 Jan 2025 11:41:43 -0800 Subject: [PATCH 09/45] [SYCL][E2E] Pass freshly built OpenCL/L0 to e2e tests for in-tree configuration (#16742) Our CI uses "standalone" configuration for running E2E tests, but interactive development is (likely) mostly using in-tree configuration. With this change we can enable more tests running by default in such mode. --- sycl/test-e2e/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sycl/test-e2e/CMakeLists.txt b/sycl/test-e2e/CMakeLists.txt index 2379d7859e6a5..57227e447c54b 100644 --- a/sycl/test-e2e/CMakeLists.txt +++ b/sycl/test-e2e/CMakeLists.txt @@ -14,6 +14,12 @@ if(SYCL_TEST_E2E_STANDALONE) if( NOT OpenCL_LIBRARY ) find_package(OpenCL) endif() +else() + if( NOT OpenCL_LIBRARY ) + set(OpenCL_LIBRARY "${LLVM_BINARY_DIR}/lib") + endif() + set(LEVEL_ZERO_INCLUDE "${LLVM_BINARY_DIR}/_deps/level-zero-loader-src/include") + set(LEVEL_ZERO_LIBS_DIR "${LLVM_BINARY_DIR}/lib") endif() # Standalone. if(SYCL_TEST_E2E_STANDALONE) From 8a279544318a1408aa1997f8673cde4066e2a399 Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Fri, 24 Jan 2025 16:31:09 -0500 Subject: [PATCH 10/45] [SYCL][E2E] Add logic to react to `REQUIRED`/`UNSUPPORTED` in `build-only` (#16725) Adds logic for evaluating `REQUIRES`/`UNSUPPORTED` statements on `build-only` mode by "ignoring" features in these statements that do not affect the compilation (Everything other than OS, triple, and SDK/libraries). 
More precisely, the ability to ignore features is implemented by extending the boolean expressions to use a third boolean state - `ignore`. If a particular sub-expression includes an `ignore`, and if its result could be changed by setting that `ignore` to either `true` or `false`, then the result of that sub-expression is set to `ignore`. For example, `ignore || true = true`, but `ignore || false = ignore`; this is because in the first example there would be no way to set the `ignore` such that the result is anything other than `true`, while in the second example the result is dependent on the "actual" value of the `ignore`. If the resulting value of a `REQUIRES` predicate is `ignore`, we interpret that as `true` (the requirements were met); on the other hand, for `UNSUPPORTED` predicates we interpret `ignore` as `false` instead (the unsupported criteria were not met). The triples that can be used for a given test are then selected by evaluating the `REQUIRES`/`UNSUPPORTED` statements with the set of features + the target feature corresponding to each triple (mirroring the way we select devices), while ignoring all features that do not affect compilation. The target features available are the following: `target-spir`, `target-amd`, `target-nvidia`, and `target-native_cpu`. Each of these maps to a triple. Similarly to how `XFAIL` is handled when using multiple devices, if we are compiling for multiple triples and a single triple is marked as `XFAIL`, then it is treated as unsupported instead. The available target triples in `build-only` mode are determined through the use of the `sycl_build_targets` lit param (e.g., `--param sycl_build_targets=spir;amd`). This is currently set to only `spir`, as the changes in test markup included in this PR do not take into account building for `nvptx` and `amdgcn` triples. In `run-only` and `full` modes the available triples are determined via the available devices.
--- .../AmdNvidiaJIT/kernel_and_bundle.cpp | 3 +- .../windows_version_agnostic_sycl_lib.cpp | 1 - sycl/test-e2e/Compression/no_zstd_warning.cpp | 1 - .../DeviceLib/math_fp64_windows_test.cpp | 1 - sycl/test-e2e/DeviceLib/math_windows_test.cpp | 1 - sycl/test-e2e/E2EExpr.py | 167 ++++++++++++++++++ .../custom-command-cuda.cpp | 1 - .../custom-command-hip.cpp | 3 +- .../custom-command-multiple-dev-cuda.cpp | 1 - .../interop-task-cuda-buffer-migrate.cpp | 1 - .../HostInteropTask/interop-task-cuda.cpp | 1 - .../HostInteropTask/interop-task-hip.cpp | 3 +- .../is_compatible/is_compatible_amdgcn.cpp | 1 - .../is_compatible/is_compatible_nvptx64.cpp | 1 - .../is_compatible_several_targets.cpp | 3 +- .../is_compatible/is_compatible_spir64.cpp | 1 - .../is_compatible_spir64_fpga.cpp | 1 - .../is_compatible_spir64_gen.cpp | 1 - .../is_compatible_spir64_x86_64.cpp | 1 - .../Regression/compile_on_win_with_mdd.cpp | 1 - .../Regression/fsycl-host-compiler-win.cpp | 1 - sycl/test-e2e/Regression/msvc_crt.cpp | 1 - sycl/test-e2e/Regression/multiple-targets.cpp | 11 +- .../SpecConstants/2020/non_native/cuda.cpp | 1 - sycl/test-e2e/format.py | 135 +++++++++----- sycl/test-e2e/lit.cfg.py | 31 ++++ sycl/test-e2e/lit.site.cfg.py.in | 3 + .../check_e2eexpr_logic.cpp | 8 + 28 files changed, 311 insertions(+), 74 deletions(-) create mode 100644 sycl/test-e2e/E2EExpr.py create mode 100644 sycl/test/e2e_test_requirements/check_e2eexpr_logic.cpp diff --git a/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp index c1e52d1d4fc4e..3927fb87ee85d 100644 --- a/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp +++ b/sycl/test-e2e/AmdNvidiaJIT/kernel_and_bundle.cpp @@ -1,6 +1,5 @@ // UNSUPPORTED: windows -// REQUIRES: cuda || hip -// REQUIRES: build-and-run-mode +// REQUIRES: target-nvidia || target-amd // This test relies on debug output from a pass, make sure that the compiler // can generate it. 
diff --git a/sycl/test-e2e/Basic/windows_version_agnostic_sycl_lib.cpp b/sycl/test-e2e/Basic/windows_version_agnostic_sycl_lib.cpp index c9487829314d8..cf7538ed495bc 100644 --- a/sycl/test-e2e/Basic/windows_version_agnostic_sycl_lib.cpp +++ b/sycl/test-e2e/Basic/windows_version_agnostic_sycl_lib.cpp @@ -1,5 +1,4 @@ // REQUIRES: windows -// REQUIRES: build-and-run-mode // RUN: %clangxx --driver-mode=cl /std:c++17 /EHsc %sycl_include -I%opencl_include_dir %s -o %t.out /link /defaultlib:%sycl_static_libs_dir/sycl.lib // RUN: %{run} %t.out diff --git a/sycl/test-e2e/Compression/no_zstd_warning.cpp b/sycl/test-e2e/Compression/no_zstd_warning.cpp index 4532f22b9845b..c87f2fe480096 100644 --- a/sycl/test-e2e/Compression/no_zstd_warning.cpp +++ b/sycl/test-e2e/Compression/no_zstd_warning.cpp @@ -1,5 +1,4 @@ // using --offload-compress without zstd should throw an error. // REQUIRES: !zstd -// REQUIRES: build-and-run-mode // RUN: not %{build} %O0 -g --offload-compress %S/Inputs/single_kernel.cpp -o %t_compress.out 2>&1 | FileCheck %s // CHECK: '--offload-compress' option is specified but zstd is not available. The device image will not be compressed. diff --git a/sycl/test-e2e/DeviceLib/math_fp64_windows_test.cpp b/sycl/test-e2e/DeviceLib/math_fp64_windows_test.cpp index 2641408261426..71e8a43387093 100644 --- a/sycl/test-e2e/DeviceLib/math_fp64_windows_test.cpp +++ b/sycl/test-e2e/DeviceLib/math_fp64_windows_test.cpp @@ -1,5 +1,4 @@ // REQUIRES: aspect-fp64, windows -// REQUIRES: build-and-run-mode // DEFINE: %{mathflags} = %if cl_options %{/clang:-fno-fast-math%} %else %{-fno-fast-math%} diff --git a/sycl/test-e2e/DeviceLib/math_windows_test.cpp b/sycl/test-e2e/DeviceLib/math_windows_test.cpp index 645493c496ae1..ab18cf1abe254 100644 --- a/sycl/test-e2e/DeviceLib/math_windows_test.cpp +++ b/sycl/test-e2e/DeviceLib/math_windows_test.cpp @@ -1,5 +1,4 @@ // REQUIRES: windows -// REQUIRES: build-and-run-mode // TODO: Add hypotf case back when the missing symbol is fixed. 
diff --git a/sycl/test-e2e/E2EExpr.py b/sycl/test-e2e/E2EExpr.py new file mode 100644 index 0000000000000..08dc667661803 --- /dev/null +++ b/sycl/test-e2e/E2EExpr.py @@ -0,0 +1,167 @@ +from lit.BooleanExpression import BooleanExpression + + +class E2EExpr(BooleanExpression): + build_specific_features = { + "build-and-run-mode", + "target-spir", + "target-nvidia", + "target-amd", + "target-native_cpu", + "any-target-is-spir", + "any-target-is-nvidia", + "any-target-is-amd", + "any-target-is-native_cpu", + "linux", + "system-linux", + "windows", + "system-windows", + "enable-perf-tests", + "preview-breaking-changes-supported", + "has_ndebug", + "ocloc", + "opencl-aot", + "opencl_icd", + "cm-compiler", + "xptifw", + "level_zero_dev_kit", + "cuda_dev_kit", + "zstd", + "vulkan", + "true", + "false", + } + + def __init__(self, string, variables, build_only_mode, final_unknown_value): + BooleanExpression.__init__(self, string, variables) + self.build_only_mode = build_only_mode + self.unknown = False + self.final_unknown_value = final_unknown_value + + @staticmethod + def evaluate(string, variables, build_only_mode, final_unknown_value=True): + """ + string: Expression to evaluate + variables: variables that evaluate to true + build_only_mode: if true enables unknown values + final_unknown_value: final boolean result if evaluation results in `unknown` + """ + try: + parser = E2EExpr( + string, set(variables), build_only_mode, final_unknown_value + ) + return parser.parseAll() + except ValueError as e: + raise ValueError(str(e) + ("\nin expression: %r" % string)) + + def parseMATCH(self): + token = self.token + BooleanExpression.parseMATCH(self) + if token not in self.build_specific_features and self.build_only_mode: + self.unknown = True + else: + self.unknown = False + if self.value and self.unknown: + raise ValueError( + 'Runtime feature "' + token + '" evaluated to True in build-only' + ) + + def parseAND(self): + self.parseNOT() + while self.accept("&&"): + left = 
self.value + left_unknown = self.unknown + self.parseNOT() + right = self.value + right_unknown = self.unknown + self.value = left and right + # Unknown if both are unknown or if one is true and the other is unknown + self.unknown = ( + (left_unknown and right_unknown) + or (left_unknown and right) + or (left and right_unknown) + ) + + def parseOR(self): + self.parseAND() + while self.accept("||"): + left = self.value + left_unknown = self.unknown + self.parseAND() + right = self.value + right_unknown = self.unknown + self.value = left or right + # Unknown if both are unknown or if one is false and the other is unknown + self.unknown = ( + (left_unknown and right_unknown) + or (left_unknown and not right) + or (not left and right_unknown) + ) + + def parseAll(self): + self.token = next(self.tokens) + self.parseOR() + self.expect(BooleanExpression.END) + return self.final_unknown_value if self.unknown else self.value + + +import unittest + + +class TestE2EExpr(unittest.TestCase): + def test_basic(self): + BuildOnly = True + BuildAndRun = False + RequiresDirective = True + UnsupportedDirective = False + RegularEval = lambda expr, features: E2EExpr.evaluate( + expr, features, BuildAndRun + ) + RequiresBuildEval = lambda expr, features: E2EExpr.evaluate( + expr, features, BuildOnly, RequiresDirective + ) + UnsupportedBuildEval = lambda expr, features: E2EExpr.evaluate( + expr, features, BuildOnly, UnsupportedDirective + ) + # Non build-only expressions should work the same + self.assertTrue(RegularEval("linux", {"linux", "rt_feature"})) + self.assertTrue(RegularEval("rt_feature", {"linux", "rt_feature"})) + self.assertFalse( + RegularEval("rt_feature1 && rt_feature2", {"linux", "rt_feature1"}) + ) + # build-only expressions with no unknowns should work the same + self.assertTrue(UnsupportedBuildEval("linux", {"linux"})) + self.assertFalse(RequiresBuildEval("linux && windows", {"linux"})) + self.assertTrue(UnsupportedBuildEval("!(windows || zstd)", {"linux"})) + # 
build-only expressions where unknown affects the resulting value + self.assertTrue(RequiresBuildEval("rt_feature", {})) + self.assertFalse(UnsupportedBuildEval("rt_feature", {})) + self.assertFalse(UnsupportedBuildEval("!rt_feature", {})) + self.assertTrue(RequiresBuildEval("windows || rt_feature", {"linux"})) + self.assertFalse(UnsupportedBuildEval("windows || rt_feature", {"linux"})) + self.assertTrue(RequiresBuildEval("linux && rt_feature", {"linux"})) + self.assertFalse(UnsupportedBuildEval("linux && rt_feature", {"linux"})) + self.assertTrue(RequiresBuildEval("linux && !(zstd || rt_feature)", {"linux"})) + self.assertFalse( + UnsupportedBuildEval("linux && !(zstd || rt_feature)", {"linux"}) + ) + # build-only expressions where unknown does not affect the resulting value + self.assertTrue(RequiresBuildEval("linux || rt_feature", {"linux"})) + self.assertTrue(UnsupportedBuildEval("linux || rt_feature", {"linux"})) + self.assertFalse(RequiresBuildEval("windows && rt_feature", {"linux"})) + self.assertFalse(UnsupportedBuildEval("windows && rt_feature", {"linux"})) + self.assertFalse( + RequiresBuildEval("linux && (vulkan && rt_feature)", {"linux"}) + ) + self.assertFalse( + UnsupportedBuildEval("linux && (vulkan && rt_feature)", {"linux"}) + ) + # runtime feature is present in build-only + with self.assertRaises(ValueError): + RequiresBuildEval("rt_feature", {"rt_feature"}) + with self.assertRaises(ValueError): + UnsupportedBuildEval("rt_feature", {"rt_feature"}) + + +if __name__ == "__main__": + unittest.main() diff --git a/sycl/test-e2e/EnqueueNativeCommand/custom-command-cuda.cpp b/sycl/test-e2e/EnqueueNativeCommand/custom-command-cuda.cpp index b777f314d2f92..7afbaf7cc14ed 100644 --- a/sycl/test-e2e/EnqueueNativeCommand/custom-command-cuda.cpp +++ b/sycl/test-e2e/EnqueueNativeCommand/custom-command-cuda.cpp @@ -1,7 +1,6 @@ // RUN: %{build} -Wno-error=deprecated-declarations -o %t.out %cuda_options // RUN: %{run} %t.out // REQUIRES: cuda, cuda_dev_kit -// 
REQUIRES: build-and-run-mode #include diff --git a/sycl/test-e2e/EnqueueNativeCommand/custom-command-hip.cpp b/sycl/test-e2e/EnqueueNativeCommand/custom-command-hip.cpp index 47bf42eee8158..00aafba08fc4a 100644 --- a/sycl/test-e2e/EnqueueNativeCommand/custom-command-hip.cpp +++ b/sycl/test-e2e/EnqueueNativeCommand/custom-command-hip.cpp @@ -2,8 +2,7 @@ // we should set this with some variable instead. // RUN: %{build} -Wno-error=deprecated-pragma -o %t.out -I%rocm_path/include -L%rocm_path/lib -lamdhip64 // RUN: %{run} %t.out -// REQUIRES: hip -// REQUIRES: build-and-run-mode +// REQUIRES: target-amd #include #include diff --git a/sycl/test-e2e/EnqueueNativeCommand/custom-command-multiple-dev-cuda.cpp b/sycl/test-e2e/EnqueueNativeCommand/custom-command-multiple-dev-cuda.cpp index f487b26026ec2..2dc30b44bfe94 100644 --- a/sycl/test-e2e/EnqueueNativeCommand/custom-command-multiple-dev-cuda.cpp +++ b/sycl/test-e2e/EnqueueNativeCommand/custom-command-multiple-dev-cuda.cpp @@ -1,5 +1,4 @@ // REQUIRES: cuda, cuda_dev_kit -// REQUIRES: build-and-run-mode // RUN: %{build} -o %t.out %cuda_options // RUN: %{run} %t.out diff --git a/sycl/test-e2e/HostInteropTask/interop-task-cuda-buffer-migrate.cpp b/sycl/test-e2e/HostInteropTask/interop-task-cuda-buffer-migrate.cpp index 2eea51a1ded9b..b6ac1f96f90e1 100644 --- a/sycl/test-e2e/HostInteropTask/interop-task-cuda-buffer-migrate.cpp +++ b/sycl/test-e2e/HostInteropTask/interop-task-cuda-buffer-migrate.cpp @@ -1,5 +1,4 @@ // REQUIRES: cuda, cuda_dev_kit -// REQUIRES: build-and-run-mode // // RUN: %{build} -o %t.out %cuda_options // RUN: %{run} %t.out diff --git a/sycl/test-e2e/HostInteropTask/interop-task-cuda.cpp b/sycl/test-e2e/HostInteropTask/interop-task-cuda.cpp index f3a61471d1d08..055160e8bb624 100644 --- a/sycl/test-e2e/HostInteropTask/interop-task-cuda.cpp +++ b/sycl/test-e2e/HostInteropTask/interop-task-cuda.cpp @@ -1,7 +1,6 @@ // RUN: %{build} -o %t.out %cuda_options // RUN: %{run} %t.out // REQUIRES: cuda, cuda_dev_kit 
-// REQUIRES: build-and-run-mode #include #include diff --git a/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp index 10e54f416f8a2..23856043230f2 100644 --- a/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp +++ b/sycl/test-e2e/HostInteropTask/interop-task-hip.cpp @@ -2,8 +2,7 @@ // we should set this with some variable instead. // RUN: %{build} -Wno-error=deprecated-pragma -Wno-error=deprecated-declarations -o %t.out -I%rocm_path/include -L%rocm_path/lib -lamdhip64 // RUN: %{run} %t.out -// REQUIRES: hip -// REQUIRES: build-and-run-mode +// REQUIRES: target-amd #include #include diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp index c0a1cb07db1e1..caf53df00e1d4 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_amdgcn.cpp @@ -1,5 +1,4 @@ // REQUIRES: hip, opencl, gpu, cpu -// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx906 -fsycl-targets=amdgcn-amd-amdhsa %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_nvptx64.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_nvptx64.cpp index 20a5139fc12cd..ccfa829293c3f 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_nvptx64.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_nvptx64.cpp @@ -1,5 +1,4 @@ // REQUIRES: cuda, opencl, gpu, cpu -// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp 
b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp index 3e9124954b774..eb7f4b0056162 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_several_targets.cpp @@ -1,8 +1,7 @@ // REQUIRES: ocloc, any-device-is-level_zero, any-device-is-gpu, any-device-is-cpu -// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen "-device *" %S/Inputs/is_compatible_with_env.cpp -o %t.out // RUN: env ONEAPI_DEVICE_SELECTOR=opencl:cpu %{run-unfiltered-devices} not %t.out // RUN: env ONEAPI_DEVICE_SELECTOR=opencl:gpu %{run-unfiltered-devices} %t.out -// RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run-unfiltered-devices} %t.out \ No newline at end of file +// RUN: env ONEAPI_DEVICE_SELECTOR=level_zero:gpu %{run-unfiltered-devices} %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp index fcf6affb809fb..465a79056906a 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64.cpp @@ -1,5 +1,4 @@ // REQUIRES: cuda, opencl, gpu, cpu -// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64 %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp index 1372c352c09ea..57366482e7082 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_fpga.cpp @@ -1,5 +1,4 @@ // REQUIRES: opencl-aot, accelerator, gpu, cpu -// REQUIRES: build-and-run-mode // RUN: %clangxx 
-fsycl -fsycl-targets=spir64_fpga %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp index c6a01b3a6dc18..5adb27e0ae697 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_gen.cpp @@ -1,5 +1,4 @@ // REQUIRES: ocloc, gpu, level_zero, cpu -// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend "-device *" %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp index 5de21b8984d71..0a6f2c39df8af 100644 --- a/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp +++ b/sycl/test-e2e/OptionalKernelFeatures/is_compatible/is_compatible_spir64_x86_64.cpp @@ -1,5 +1,4 @@ // REQUIRES: opencl-aot, cpu, gpu, level_zero -// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64 %S/Inputs/is_compatible_with_env.cpp -o %t.out diff --git a/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp b/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp index ea1e1c7d49891..57826c81ece2e 100644 --- a/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp +++ b/sycl/test-e2e/Regression/compile_on_win_with_mdd.cpp @@ -1,5 +1,4 @@ // REQUIRES: windows -// REQUIRES: build-and-run-mode // RUN: %clangxx --driver-mode=cl -fsycl /MDd -c %s -o %t.obj // RUN: %clangxx --driver-mode=cl -fsycl %t.obj -Wno-unused-command-line-argument -o %t.out diff --git a/sycl/test-e2e/Regression/fsycl-host-compiler-win.cpp b/sycl/test-e2e/Regression/fsycl-host-compiler-win.cpp index e4073831bb71b..4ffe42c45c52f 100644 --- 
a/sycl/test-e2e/Regression/fsycl-host-compiler-win.cpp +++ b/sycl/test-e2e/Regression/fsycl-host-compiler-win.cpp @@ -1,7 +1,6 @@ // RUN: %{build} -fsycl-host-compiler=cl -DDEFINE_CHECK -fsycl-host-compiler-options="-DDEFINE_CHECK /std:c++17 /Zc:__cplusplus" -o %t.exe // RUN: %{run} %t.exe // REQUIRES: windows -// REQUIRES: build-and-run-mode // //==------- fsycl-host-compiler-win.cpp - external host compiler test ------==// // diff --git a/sycl/test-e2e/Regression/msvc_crt.cpp b/sycl/test-e2e/Regression/msvc_crt.cpp index a54570efd820c..9d59547e50d7b 100644 --- a/sycl/test-e2e/Regression/msvc_crt.cpp +++ b/sycl/test-e2e/Regression/msvc_crt.cpp @@ -3,7 +3,6 @@ // RUN: %{build} /MDd -o %t2.exe // RUN: %{run} %t2.exe // REQUIRES: system-windows, cl_options -// REQUIRES: build-and-run-mode //==-------------- msvc_crt.cpp - SYCL MSVC CRT test -----------------------==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. diff --git a/sycl/test-e2e/Regression/multiple-targets.cpp b/sycl/test-e2e/Regression/multiple-targets.cpp index aa8c125d90738..9e1680453ccdb 100644 --- a/sycl/test-e2e/Regression/multiple-targets.cpp +++ b/sycl/test-e2e/Regression/multiple-targets.cpp @@ -2,18 +2,17 @@ // It tests if the target triples can be specified with any order. // The test is repeated for per_kernel device code splitting. 
// -// REQUIRES: cuda || hip || native_cpu -// REQUIRES: build-and-run-mode -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple},spir64 %if any-device-is-hip %{ %{hip_arch_opts} %} -o %t1.out %s +// REQUIRES: (target-nvidia || target-amd || target-native_cpu) && any-target-is-spir +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple},spir64 %if any-target-is-amd %{ %{hip_arch_opts} %} -o %t1.out %s // RUN: %{run} %t1.out // -// RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} %if any-device-is-hip %{ %{hip_arch_opts} %} -o %t2.out %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} %if any-target-is-amd %{ %{hip_arch_opts} %} -o %t2.out %s // RUN: %{run} %t2.out // -// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple},spir64 %if any-device-is-hip %{ %{hip_arch_opts} %} -fsycl-device-code-split=per_kernel -o %t3.out %s +// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple},spir64 %if any-target-is-amd %{ %{hip_arch_opts} %} -fsycl-device-code-split=per_kernel -o %t3.out %s // RUN: %{run} %t3.out // -// RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} %if any-device-is-hip %{ %{hip_arch_opts} %} -fsycl-device-code-split=per_kernel -o %t4.out %s +// RUN: %clangxx -fsycl -fsycl-targets=spir64,%{sycl_triple} %if any-target-is-amd %{ %{hip_arch_opts} %} -fsycl-device-code-split=per_kernel -o %t4.out %s // RUN: %{run} %t4.out #include diff --git a/sycl/test-e2e/SpecConstants/2020/non_native/cuda.cpp b/sycl/test-e2e/SpecConstants/2020/non_native/cuda.cpp index af12b66208dad..12d2ce6a9faf9 100644 --- a/sycl/test-e2e/SpecConstants/2020/non_native/cuda.cpp +++ b/sycl/test-e2e/SpecConstants/2020/non_native/cuda.cpp @@ -1,5 +1,4 @@ // REQUIRES: cuda -// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda %S/Inputs/common.cpp -o %t.out // RUN: %{run-unfiltered-devices} env ONEAPI_DEVICE_SELECTOR="cuda:*" %t.out diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index eac6ebe0ba2ed..a9e98a4ed8037 
100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -2,27 +2,16 @@ import lit.formats import platform -from lit.BooleanExpression import BooleanExpression from lit.TestRunner import ( ParserKind, IntegratedTestKeywordParser, - # parseIntegratedTestScript, ) +from E2EExpr import E2EExpr import os import re -def get_triple(backend): - if backend == "cuda": - return "nvptx64-nvidia-cuda" - if backend == "hip": - return "amdgcn-amd-amdhsa" - if backend == "native_cpu": - return "native_cpu" - return "spir64" - - def parse_min_intel_driver_req(line_number, line, output): """ Driver version looks like this for Intel devices: @@ -87,22 +76,85 @@ def parseTestScript(self, test): return script - def getMatchedFromList(self, features, alist): + def getMatchedFromList( + self, features, expression_list, build_only_mode, is_requires_directive + ): try: return [ - item for item in alist if BooleanExpression.evaluate(item, features) + item + for item in expression_list + if E2EExpr.evaluate( + item, features, build_only_mode, is_requires_directive + ) + != is_requires_directive ] except ValueError as e: - raise ValueError("Error in UNSUPPORTED list:\n%s" % str(e)) + raise ValueError("Error in expression:\n%s" % str(e)) + + BuildOnly = True + BuildAndRun = False + RequiresDirective = True + UnsupportedDirective = False + + def getMissingRequires(self, features, expression_list): + return self.getMatchedFromList( + features, expression_list, self.BuildAndRun, self.RequiresDirective + ) + + def getMissingRequiresBuildOnly(self, features, expression_list): + return self.getMatchedFromList( + features, expression_list, self.BuildOnly, self.RequiresDirective + ) + + def getMatchedUnsupported(self, features, expression_list): + return self.getMatchedFromList( + features, expression_list, self.BuildAndRun, self.UnsupportedDirective + ) + + def getMatchedUnsupportedBuildOnly(self, features, expression_list): + return self.getMatchedFromList( + features, expression_list, 
self.BuildOnly, self.UnsupportedDirective + ) + + getMatchedXFail = getMatchedUnsupported + + def select_build_targets_for_test(self, test): + supported_targets = set() + for t in test.config.sycl_build_targets: + features = test.config.available_features.union({t}) + if self.getMissingRequiresBuildOnly(features, test.requires): + continue + if self.getMatchedUnsupportedBuildOnly(features, test.unsupported): + continue + supported_targets.add(t) + + if len(supported_targets) <= 1: + return supported_targets + + # Treat XFAIL as UNSUPPORTED if the test is to be compiled for multiple + # triples. + + if "*" in test.xfails: + return [] + + triples_without_xfail = [ + t + for t in supported_targets + if not self.getMatchedXFail( + test.config.available_features.union({t}), test.xfails + ) + ] + + return triples_without_xfail def select_devices_for_test(self, test): devices = [] for d in test.config.sycl_devices: features = test.config.sycl_dev_features[d] - if test.getMissingRequiredFeaturesFromList(features): + if self.getMissingRequires(features, test.requires): continue - if self.getMatchedFromList(features, test.unsupported): + if self.getMatchedUnsupported(features, test.unsupported): continue driver_ok = True @@ -134,9 +186,7 @@ def select_devices_for_test(self, test): devices_without_xfail = [ d for d in devices - if not self.getMatchedFromList( - test.config.sycl_dev_features[d], test.xfails - ) + if not self.getMatchedXFail(test.config.sycl_dev_features[d], test.xfails) ] return devices_without_xfail @@ -152,13 +202,13 @@ def execute(self, test, litConfig): return script devices_for_test = [] - triples = set() + build_targets = set() if test.config.test_mode == "build-only": - if "build-and-run-mode" in test.requires or "true" in test.unsupported: + build_targets = self.select_build_targets_for_test(test) + if not build_targets: return lit.Test.Result( - lit.Test.UNSUPPORTED, "Test unsupported for this environment" + lit.Test.UNSUPPORTED, "No supported triple 
to build for" ) - triples = {"spir64"} else: devices_for_test = self.select_devices_for_test(test) if not devices_for_test: @@ -168,14 +218,16 @@ def execute(self, test, litConfig): for sycl_device in devices_for_test: (backend, _) = sycl_device.split(":") - triples.add(get_triple(backend)) + build_targets.add(test.config.backend_to_target[backend]) + + triples = set(test.config.target_to_triple[t] for t in build_targets) substitutions = lit.TestRunner.getDefaultSubstitutions(test, tmpDir, tmpBase) substitutions.append(("%{sycl_triple}", format(",".join(triples)))) sycl_target_opts = "-fsycl-targets=%{sycl_triple}" - if get_triple("hip") in triples: + if "target-amd" in build_targets: hip_arch_opts = ( " -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch={}".format( test.config.amd_arch @@ -184,7 +236,7 @@ def execute(self, test, litConfig): sycl_target_opts += hip_arch_opts substitutions.append(("%{hip_arch_opts}", hip_arch_opts)) if ( - get_triple("spir64") in triples + "target-spir" in build_targets and "spirv-backend" in test.config.available_features ): sycl_target_opts += " -fsycl-use-spirv-backend-for-spirv-gen" @@ -323,19 +375,18 @@ def get_extra_env(sycl_devices): test, litConfig, useExternalSh, script, tmpBase ) - if len(devices_for_test) > 1 or test.config.test_mode == "build-only": - return result - - # Single device - might be an XFAIL. - device = devices_for_test[0] - if "*" in test.xfails or self.getMatchedFromList( - test.config.sycl_dev_features[device], test.xfails - ): - if result.code is lit.Test.PASS: - result.code = lit.Test.XPASS - # fail -> expected fail - elif result.code is lit.Test.FAIL: - result.code = lit.Test.XFAIL - return result - + # Single triple/device - might be an XFAIL. 
+ def map_result(features, code): + if "*" in test.xfails or self.getMatchedXFail(features, test.xfails): + if code is lit.Test.PASS: + code = lit.Test.XPASS + elif code is lit.Test.FAIL: + code = lit.Test.XFAIL + return code + + if len(triples) == 1 and test.config.test_mode == "build-only": + result.code = map_result(test.config.available_features, result.code) + if len(devices_for_test) == 1: + device = devices_for_test[0] + result.code = map_result(test.config.sycl_dev_features[device], result.code) return result diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py index 5b60d93387b7a..c58f702e6f5d5 100644 --- a/sycl/test-e2e/lit.cfg.py +++ b/sycl/test-e2e/lit.cfg.py @@ -15,6 +15,23 @@ from lit.llvm.subst import ToolSubst, FindTool # Configuration file for the 'lit' test runner. +config.backend_to_target = { + "level_zero": "target-spir", + "opencl": "target-spir", + "cuda": "target-nvidia", + "hip": "target-amd", + "native_cpu": "target-native_cpu", +} +config.target_to_triple = { + "target-spir": "spir64", + "target-nvidia": "nvptx64-nvidia-cuda", + "target-amd": "amdgcn-amd-amdhsa", + "target-native_cpu": "native_cpu", +} +config.triple_to_target = {v: k for k, v in config.target_to_triple.items()} +config.backend_to_triple = { + k: config.target_to_triple.get(v) for k, v in config.backend_to_target.items() +} # name: The name of this test suite. 
config.name = "SYCL" @@ -52,6 +69,8 @@ elif config.test_mode == "build-only": lit_config.note("build-only test mode enabled, only compiling tests") config.sycl_devices = [] + if not config.amd_arch: + config.amd_arch = "gfx1031" else: lit_config.error("Invalid argument for test-mode") @@ -677,12 +696,21 @@ def open_check_file(file_name): "Couldn't find pre-installed AOT device compiler " + aot_tool ) +# Clear build targets when not in build-only, to populate according to devices +if config.test_mode != "build-only": + config.sycl_build_targets = set() + for sycl_device in config.sycl_devices: be, dev = sycl_device.split(":") config.available_features.add("any-device-is-" + dev) # Use short names for LIT rules. config.available_features.add("any-device-is-" + be) + target = config.backend_to_target[be] + config.sycl_build_targets.add(target) + +for target in config.sycl_build_targets: + config.available_features.add("any-target-is-" + target.replace("target-", "")) # That has to be executed last so that all device-independent features have been # discovered already. config.sycl_dev_features = {} @@ -826,6 +854,9 @@ def open_check_file(file_name): features.add(dev.replace("fpga", "accelerator")) # Use short names for LIT rules. 
features.add(be) + # Add corresponding target feature + target = config.backend_to_target[be] + features.add(target) if be == "hip": if not config.amd_arch: diff --git a/sycl/test-e2e/lit.site.cfg.py.in b/sycl/test-e2e/lit.site.cfg.py.in index 00928dd9141fc..02f4125a4680f 100644 --- a/sycl/test-e2e/lit.site.cfg.py.in +++ b/sycl/test-e2e/lit.site.cfg.py.in @@ -30,6 +30,9 @@ config.igc_tag_file = os.path.join("/usr/local/lib/igc/", 'IGCTAG.txt') config.sycl_devices = lit_config.params.get("sycl_devices", "@SYCL_TEST_E2E_TARGETS@").split(';') +# FIXME: current test markup only supports spir in build-only +config.sycl_build_targets = set("target-" + t for t in lit_config.params.get("sycl_build_targets", "spir").split(';')) + config.amd_arch = lit_config.params.get("amd_arch", "@AMD_ARCH@") config.sycl_threads_lib = '@SYCL_THREADS_LIB@' config.extra_environment = lit_config.params.get("extra_environment", "@LIT_EXTRA_ENVIRONMENT@") diff --git a/sycl/test/e2e_test_requirements/check_e2eexpr_logic.cpp b/sycl/test/e2e_test_requirements/check_e2eexpr_logic.cpp new file mode 100644 index 0000000000000..2e394d4a78687 --- /dev/null +++ b/sycl/test/e2e_test_requirements/check_e2eexpr_logic.cpp @@ -0,0 +1,8 @@ +// E2E tests use a modified expression parser that allows for a third "unknown" +// boolean state to handle missing run-time features in REQUIRES/UNSUPPORTED +// statements. This test runs the unit tests related to these expressions. 
+// +// REQUIRES: linux +// DEFINE: %{E2EExpr}=%S/../../test-e2e/E2EExpr.py +// DEFINE: %{lit_source}=%S/../../../llvm/utils/lit +// RUN: env PYTHONPATH=%{lit_source} python %{E2EExpr} From 155fe36a9407a8e946761628561f2deee9fbc5a7 Mon Sep 17 00:00:00 2001 From: Artur Gainullin Date: Fri, 24 Jan 2025 14:01:43 -0800 Subject: [PATCH 11/45] [SYCL] Fix assertion at host pipe initialization in case of multiple TUs (#16756) If there are multiple translation units using the host pipe then initialize function will be called multiple times with the same pointer because special function for pipe registration and initialization (__sycl_host_pipe_registration) has to be emitted by frontend for each translation unit. Just assert that pointer is the same. --- sycl/source/detail/host_pipe_map_entry.hpp | 13 +++++++++++- .../Basic/fpga_tests/Inputs/kernel.cpp | 7 +++++++ .../Basic/fpga_tests/Inputs/mypipe.hpp | 11 ++++++++++ .../fpga_host_pipe_multiple_tus.cpp | 20 +++++++++++++++++++ 4 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 sycl/test-e2e/Basic/fpga_tests/Inputs/kernel.cpp create mode 100644 sycl/test-e2e/Basic/fpga_tests/Inputs/mypipe.hpp create mode 100644 sycl/test-e2e/Basic/fpga_tests/fpga_host_pipe_multiple_tus.cpp diff --git a/sycl/source/detail/host_pipe_map_entry.hpp b/sycl/source/detail/host_pipe_map_entry.hpp index e27581df32a6f..4fc80a6df1071 100644 --- a/sycl/source/detail/host_pipe_map_entry.hpp +++ b/sycl/source/detail/host_pipe_map_entry.hpp @@ -43,7 +43,18 @@ struct HostPipeMapEntry { } void initialize(const void *HostPipePtr) { - assert(!MHostPipePtr && "Host pipe pointer has already been initialized."); + // If there are multiple translation units using the host pipe then + // initialize function will be called multiple times with the same pointer + // because special function for pipe registration and initialization has to + // be emitted by frontend for each translation unit. Just make sure that + // pointer is the same. 
+ if (MHostPipePtr) { + assert(MHostPipePtr == HostPipePtr && + "Host pipe intializations disagree on address of the host pipe on " + "host."); + return; + } + MHostPipePtr = HostPipePtr; } diff --git a/sycl/test-e2e/Basic/fpga_tests/Inputs/kernel.cpp b/sycl/test-e2e/Basic/fpga_tests/Inputs/kernel.cpp new file mode 100644 index 0000000000000..83cd1abc09ff1 --- /dev/null +++ b/sycl/test-e2e/Basic/fpga_tests/Inputs/kernel.cpp @@ -0,0 +1,7 @@ +#include "mypipe.hpp" +#include + +void KernelFunctor::operator()() const { + uint32_t data = 2; + my_pipe::write(data); +} diff --git a/sycl/test-e2e/Basic/fpga_tests/Inputs/mypipe.hpp b/sycl/test-e2e/Basic/fpga_tests/Inputs/mypipe.hpp new file mode 100644 index 0000000000000..b1938993b6050 --- /dev/null +++ b/sycl/test-e2e/Basic/fpga_tests/Inputs/mypipe.hpp @@ -0,0 +1,11 @@ +#include +#include +#include + +#pragma once + +struct KernelFunctor { + using my_pipe = + sycl::ext::intel::experimental::pipe; + SYCL_EXTERNAL void operator()() const; +}; diff --git a/sycl/test-e2e/Basic/fpga_tests/fpga_host_pipe_multiple_tus.cpp b/sycl/test-e2e/Basic/fpga_tests/fpga_host_pipe_multiple_tus.cpp new file mode 100644 index 0000000000000..7b2f9fd4673be --- /dev/null +++ b/sycl/test-e2e/Basic/fpga_tests/fpga_host_pipe_multiple_tus.cpp @@ -0,0 +1,20 @@ +// REQUIRES: accelerator +// RUN: %clangxx -fsycl -fintelfpga %s %S/Inputs/kernel.cpp -I%S/Inputs -o %t.out +// RUN: %{run} %t.out | FileCheck %s + +// Test checks that host pipe initialization doesn't fail if pipe is used in +// multiple translation units. 
+ +#include "mypipe.hpp" +#include + +int main() { + sycl::queue q{sycl::ext::intel::fpga_emulator_selector_v}; + q.submit([&](sycl::handler &cgh) { cgh.single_task(KernelFunctor{}); }); + uint32_t result = KernelFunctor::my_pipe::read(q); + q.wait(); + // CHECK: 2 + std::cout << result << std::endl; + + return 0; +} From 1dbe40360ec3e9da304b1a68df4995b3399917e9 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Fri, 24 Jan 2025 14:36:15 -0800 Subject: [PATCH 12/45] [CI] Switch default `merge_ref` to empty (#16772) There were a few places that didn't pass explicit `''` and all of them were actual bugs: * `sycl-nightly.yml` didn't pass it for windows build and run-tests * `sycl-nightly.yml` didn't pass it for `ubuntu2404_oneapi_build` job (passed for other linux build jobs) --- .github/workflows/sycl-linux-build.yml | 8 -------- .github/workflows/sycl-linux-precommit-aws.yml | 1 - .github/workflows/sycl-linux-precommit.yml | 4 ---- .github/workflows/sycl-linux-run-tests.yml | 8 -------- .github/workflows/sycl-nightly.yml | 5 ----- .github/workflows/sycl-post-commit.yml | 6 +----- .github/workflows/sycl-rel-nightly.yml | 5 ----- .github/workflows/sycl-weekly.yml | 1 - .github/workflows/sycl-windows-build.yml | 7 ------- .github/workflows/sycl-windows-precommit.yml | 2 -- .github/workflows/sycl-windows-run-tests.yml | 8 -------- devops/actions/cached_checkout/action.yml | 2 +- devops/actions/run-tests/e2e/action.yml | 3 --- 13 files changed, 2 insertions(+), 58 deletions(-) diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index d528aa6ce7ce3..64ec651b44643 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -40,12 +40,6 @@ on: description: 'Filter matches for the changed files in the PR' default: '[llvm, clang, sycl, llvm_spirv, xptifw, libclc, libdevice]' required: false - merge_ref: - description: | - Commit-ish to merge post-checkout if non-empty. 
Must be reachable from - the default_branch input paramter. - type: string - default: 'FETCH_HEAD' retention-days: description: 'Artifacts retention period' type: string @@ -150,7 +144,6 @@ jobs: with: path: src ref: ${{ inputs.build_ref || github.sha }} - merge_ref: ${{ inputs.merge_ref }} cache_path: "/__w/repo_cache/" - name: Setup oneAPI env if: ${{ inputs.cc == 'icx' || inputs.cxx == 'icpx' }} @@ -281,7 +274,6 @@ jobs: uses: ./devops/actions/run-tests/e2e with: ref: ${{ inputs.ref || github.sha }} - merge_ref: ${{ inputs.merge_ref }} e2e_testing_mode: build-only target_devices: all artifact_suffix: default diff --git a/.github/workflows/sycl-linux-precommit-aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml index 297f7defb7c1b..49dd9b245b5c7 100644 --- a/.github/workflows/sycl-linux-precommit-aws.yml +++ b/.github/workflows/sycl-linux-precommit-aws.yml @@ -72,7 +72,6 @@ jobs: # No idea why but that seems to work and be in sync with the main # pre-commit workflow. ref: ${{ github.event.workflow_run.referenced_workflows[0].sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: llvm_sycl.tar.zst diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index 813bc04f2a695..a4f36c4aeae23 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -41,7 +41,6 @@ jobs: uses: ./.github/workflows/sycl-linux-build.yml with: build_ref: ${{ github.sha }} - merge_ref: '' build_cache_root: "/__w/" build_artifact_suffix: "default" build_cache_suffix: "default" @@ -97,7 +96,6 @@ jobs: extra_lit_opts: --param fallback-to-build-if-requires-build-and-run=True ${{ matrix.extra_lit_opts }} reset_intel_gpu: ${{ matrix.reset_intel_gpu }} ref: ${{ github.sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} sycl_toolchain_decompress_command: ${{ 
needs.build.outputs.artifact_decompress_command }} @@ -179,7 +177,6 @@ jobs: skip_run: ${{matrix.use_igc_dev && contains(github.event.pull_request.labels.*.name, 'ci-no-devigc') || 'false'}} ref: ${{ github.sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} @@ -228,7 +225,6 @@ jobs: extra_lit_opts: -a -j 1 --param enable-perf-tests=True ref: ${{ github.sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index b3b4f62e370db..ca9645c947e25 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -38,13 +38,6 @@ on: ref: type: string required: True - merge_ref: - description: | - Commit-ish to merge post-checkout if non-empty. Must be reachable from - the default_branch input paramter. 
- type: string - default: 'FETCH_HEAD' - required: False sycl_toolchain_artifact: type: string @@ -296,7 +289,6 @@ jobs: uses: ./devops/actions/run-tests/e2e with: ref: ${{ inputs.ref || github.sha }} - merge_ref: ${{ inputs.merge_ref }} e2e_binaries_artifact: ${{ inputs.e2e_binaries_artifact }} extra_cmake_args: ${{ inputs.extra_cmake_args }} e2e_testing_mode: ${{ inputs.e2e_testing_mode }} diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 5485719d60141..52510b54c76a5 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -17,7 +17,6 @@ jobs: build_artifact_suffix: default build_configure_extra_args: '--hip --cuda' build_image: ghcr.io/intel/llvm/ubuntu2204_build:latest - merge_ref: '' retention-days: 90 # We upload the build for people to download/use, override its name and @@ -33,7 +32,6 @@ jobs: build_cache_suffix: sprod_shared build_artifact_suffix: sprod_shared build_configure_extra_args: '--shared-libs --hip --cuda --native_cpu' - merge_ref: '' artifact_archive_name: sycl_linux_shared.tar.zst @@ -112,7 +110,6 @@ jobs: extra_lit_opts: "--param 'cxx_flags=-D_GLIBCXX_USE_CXX11_ABI=0' ${{ matrix.extra_lit_opts }}" reset_intel_gpu: ${{ matrix.reset_intel_gpu }} ref: ${{ github.sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} @@ -129,7 +126,6 @@ jobs: image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN ref: ${{ github.sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_oneapi sycl_toolchain_archive: ${{ needs.ubuntu2404_oneapi_build.outputs.artifact_archive_name }} sycl_toolchain_decompress_command: ${{ 
needs.ubuntu2404_oneapi_build.outputs.artifact_decompress_command }} @@ -175,7 +171,6 @@ jobs: image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1 target_devices: cuda:gpu ref: ${{ github.sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl-post-commit.yml index ec90056d8ed00..f3e4c224f897a 100644 --- a/.github/workflows/sycl-post-commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -36,7 +36,6 @@ jobs: build_cache_suffix: default build_artifact_suffix: default build_configure_extra_args: --no-assertions --hip --cuda --native_cpu --cmake-opt="-DSYCL_ENABLE_STACK_PRINTING=ON" --cmake-opt="-DSYCL_LIB_WITH_DEBUG_SYMBOL=ON" - merge_ref: '' e2e-lin: needs: [build-lin] @@ -90,7 +89,6 @@ jobs: env: ${{ matrix.env || '{}' }} ref: ${{ github.sha }} - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.build-lin.outputs.artifact_archive_name }} @@ -106,8 +104,7 @@ jobs: compiler: icx build_configure_extra_args: --cmake-opt=-DCMAKE_C_FLAGS="/fp:precise /clang:-Wno-nonportable-include-path /clang:-Wno-cast-function-type-mismatch" --cmake-opt=-DCMAKE_CXX_FLAGS="/fp:precise /clang:-Wno-nonportable-include-path /clang:-Wno-cast-function-type-mismatch" --cmake-opt="-DCMAKE_EXE_LINKER_FLAGS=/manifest:no" --cmake-opt="-DCMAKE_MODULE_LINKER_FLAGS=/manifest:no" --cmake-opt="-DCMAKE_SHARED_LINKER_FLAGS=/manifest:no" build_cache_suffix: icx - merge_ref: '' - + e2e-win: needs: build-win # Continue if build was successful. 
@@ -121,7 +118,6 @@ jobs: runner: '["Windows","gen12"]' sycl_toolchain_archive: ${{ needs.build-win.outputs.artifact_archive_name }} compiler: icx - merge_ref: '' macos_default: name: macOS diff --git a/.github/workflows/sycl-rel-nightly.yml b/.github/workflows/sycl-rel-nightly.yml index ae33d157e2f7a..803eca00cf857 100644 --- a/.github/workflows/sycl-rel-nightly.yml +++ b/.github/workflows/sycl-rel-nightly.yml @@ -39,7 +39,6 @@ jobs: build_artifact_suffix: default build_configure_extra_args: '--hip --cuda' build_image: ghcr.io/intel/llvm/ubuntu2204_build:latest - merge_ref: '' build_ref: sycl-rel-6_0_0 # We upload the build for people to download/use, override its name and @@ -109,7 +108,6 @@ jobs: extra_lit_opts: ${{ matrix.extra_lit_opts }} reset_intel_gpu: ${{ matrix.reset_intel_gpu }} ref: sycl-rel-6_0_0 - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }} @@ -120,7 +118,6 @@ jobs: uses: ./.github/workflows/sycl-windows-build.yml with: ref: sycl-rel-6_0_0 - merge_ref: '' # We upload both Linux/Windows build via Github's "Releases" # functionality, make sure Linux/Windows names follow the same pattern. 
@@ -140,7 +137,6 @@ jobs: sycl_toolchain_archive: ${{ needs.build-win.outputs.artifact_archive_name }} extra_lit_opts: --param gpu-intel-gen12=True ref: sycl-rel-6_0_0 - merge_ref: '' cuda-aws-start: needs: [ubuntu2204_build] @@ -162,7 +158,6 @@ jobs: image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1 target_devices: cuda:gpu ref: sycl-rel-6_0_0 - merge_ref: '' sycl_toolchain_artifact: sycl_linux_default sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }} diff --git a/.github/workflows/sycl-weekly.yml b/.github/workflows/sycl-weekly.yml index d6dcf3114695e..0974470b972a2 100644 --- a/.github/workflows/sycl-weekly.yml +++ b/.github/workflows/sycl-weekly.yml @@ -19,7 +19,6 @@ jobs: build_cache_root: "/__w/" build_artifact_suffix: default build_configure_extra_args: '' - merge_ref: '' build-sycl-cts: needs: ubuntu2204_build diff --git a/.github/workflows/sycl-windows-build.yml b/.github/workflows/sycl-windows-build.yml index 9a19cf677e07e..ef9c75b860539 100644 --- a/.github/workflows/sycl-windows-build.yml +++ b/.github/workflows/sycl-windows-build.yml @@ -18,12 +18,6 @@ on: ref: type: string required: False - merge_ref: - description: | - Commit-ish to merge post-checkout if non-empty. Must be reachable from - the default_branch input paramter. 
- type: string - default: 'FETCH_HEAD' artifact_archive_name: type: string default: llvm_sycl.tar.gz @@ -106,7 +100,6 @@ jobs: with: path: src ref: ${{ inputs.ref || github.sha }} - merge_ref: ${{ inputs.merge_ref }} cache_path: "D:\\\\github\\\\_work\\\\repo_cache\\\\" - name: Configure shell: cmd diff --git a/.github/workflows/sycl-windows-precommit.yml b/.github/workflows/sycl-windows-precommit.yml index b4ba46c08d429..9dafe3e862f8c 100644 --- a/.github/workflows/sycl-windows-precommit.yml +++ b/.github/workflows/sycl-windows-precommit.yml @@ -42,7 +42,6 @@ jobs: uses: ./.github/workflows/sycl-windows-build.yml with: changes: ${{ needs.detect_changes.outputs.filters }} - merge_ref: '' e2e: needs: build @@ -56,4 +55,3 @@ jobs: name: Intel GEN12 Graphics with Level Zero runner: '["Windows","gen12"]' sycl_toolchain_archive: ${{ needs.build.outputs.artifact_archive_name }} - merge_ref: '' diff --git a/.github/workflows/sycl-windows-run-tests.yml b/.github/workflows/sycl-windows-run-tests.yml index dbd4d7ff439ed..3a941ac4c4f8e 100644 --- a/.github/workflows/sycl-windows-run-tests.yml +++ b/.github/workflows/sycl-windows-run-tests.yml @@ -18,13 +18,6 @@ on: ref: type: string required: False - merge_ref: - description: | - Commit-ish to merge post-checkout if non-empty. Must be reachable from - the default_branch input paramter. 
- type: string - default: 'FETCH_HEAD' - required: False sycl_toolchain_artifact: type: string @@ -76,7 +69,6 @@ jobs: with: path: llvm ref: ${{ inputs.ref || github.sha }} - merge_ref: ${{ inputs.merge_ref }} cache_path: "D:\\\\github\\\\_work\\\\repo_cache\\\\" - name: Download compiler toolchain uses: actions/download-artifact@v4 diff --git a/devops/actions/cached_checkout/action.yml b/devops/actions/cached_checkout/action.yml index f2c25bcbca0a5..4eaac59cbcc12 100644 --- a/devops/actions/cached_checkout/action.yml +++ b/devops/actions/cached_checkout/action.yml @@ -10,7 +10,7 @@ inputs: description: | Commit-ish to merge post-checkout if non-empty. Must be reachable from the default_branch input paramter. - default: 'FETCH_HEAD' + default: '' path: description: 'Path to checkout repo to' fetch-depth: diff --git a/devops/actions/run-tests/e2e/action.yml b/devops/actions/run-tests/e2e/action.yml index 414f88d08c058..18ac10c490dbd 100644 --- a/devops/actions/run-tests/e2e/action.yml +++ b/devops/actions/run-tests/e2e/action.yml @@ -3,8 +3,6 @@ name: 'Run SYCL E2E tests' inputs: ref: required: false - merge_ref: - required: false e2e_binaries_artifact: required: false extra_cmake_args: @@ -31,7 +29,6 @@ runs: with: path: llvm ref: ${{ inputs.ref || github.sha }} - merge_ref: ${{ inputs.merge_ref }} cache_path: "/__w/repo_cache/" - name: Download E2E Binaries From ab25aa3222a5906cfd620e942bcf902e58c1cec8 Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Mon, 27 Jan 2025 08:20:57 +0200 Subject: [PATCH 13/45] [NFC][SYCL] Remove spurious underscore in comments (#16777) Based on the code for supported architectures the macro should start with only two underscores. 
--- .../sycl/ext/oneapi/experimental/device_architecture.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp index 4c940d9c6e3ef..e53255a7aa621 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp @@ -30,7 +30,7 @@ enum class architecture : uint64_t { // - new value for -fsycl-targets option to the compiler driver in // accordance with changes from sycl/doc/UsersManual.md and update the // compiler driver tests -// - ___SYCL_TARGET___ to the compiler driver and to all places below +// - __SYCL_TARGET___ to the compiler driver and to all places below // - the unique ID of the new architecture to the SYCL RT source code to // support querying the device architecture through // device::get_info From 3ff64285ec6ab6a016764949f6bd8bd9182bde17 Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Mon, 27 Jan 2025 00:57:02 -0800 Subject: [PATCH 14/45] [SYCL][E2E][Joint Matrix] Add half colmajor A colmajor B test (#16683) - Add test for half colmajor A, colmajor B load for 8x16x16 - Refactor to get rid of unnecessary SG32-specific file --- ...nt_matrix_bfloat16_colmajorA_colmajorB.cpp | 22 ----- ...oint_matrix_16bit_colmajorA_colmajorB.cpp} | 89 ++++++++++++++----- ...nt_matrix_bfloat16_colmajorA_colmajorB.cpp | 20 ----- 3 files changed, 66 insertions(+), 65 deletions(-) delete mode 100644 sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp rename sycl/test-e2e/Matrix/{Inputs/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp => joint_matrix_16bit_colmajorA_colmajorB.cpp} (60%) delete mode 100644 sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp 
b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp deleted file mode 100644 index f4f2e1719dd6c..0000000000000 --- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ /dev/null @@ -1,22 +0,0 @@ -//==-- joint_matrix_bfloat16_colmajorA_colmajorB.cpp - DPC++ joint_matrix--==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix -// REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943 - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// This tests support of col major layout for matrix B which does transpose and -// then VNNI transform. This is currently only available on AMX - -// XFAIL: gpu -// XFAIL-TRACKER: GSD-5768 - -#include "common.hpp" -#define SG_SZ 32 -#include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" diff --git a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_16bit_colmajorA_colmajorB.cpp similarity index 60% rename from sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp rename to sycl/test-e2e/Matrix/joint_matrix_16bit_colmajorA_colmajorB.cpp index bab88721fb1a9..edd24c2f6e248 100644 --- a/sycl/test-e2e/Matrix/Inputs/joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp +++ b/sycl/test-e2e/Matrix/joint_matrix_16bit_colmajorA_colmajorB.cpp @@ -1,4 +1,4 @@ -//==-joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp- DPC++ joint_matrix-==// +//==-joint_matrix_16bit_colmajorA_colmajorB.cpp- DPC++ joint_matrix-==// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -6,27 +6,46 @@ // //===----------------------------------------------------------------------===// +// This tests support of col major layout for matrix B which does transpose and +// then VNNI transform. This is currently only available on AMX + +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// RUN: %{build} -o %t32.out -DSG_SZ=32 +// RUN: %{run} %t32.out + +// XFAIL: gpu +// XFAIL-TRACKER: GSD-5768 + +#include "common.hpp" + constexpr size_t TM = 8; constexpr size_t TN = 16; constexpr size_t TK = 16; +template class imatrix; + template void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { size_t NDRangeM = M / TM; size_t NDRangeN = N / TN; - buffer bufA(A.get_data(), range<2>(M, K)); - buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); buffer bufC((float *)C.get_data(), range<2>(M, N)); queue q; - size_t sg_size = get_sg_size(q); + size_t sg_size = get_sg_size>(q); + std::cout << "subgroup size " << sg_size << " "; + q.submit([&](handler &cgh) { auto accC = bufC.get_access(cgh); - auto accA = bufA.get_access(cgh); - auto accB = bufB.get_access(cgh); + auto accA = bufA.template get_access(cgh); + auto accB = bufB.template get_access(cgh); - cgh.parallel_for( + cgh.parallel_for>( nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}), [=](nd_item<2> spmd_item) #ifdef SG_SZ @@ -42,10 +61,8 @@ void matrix_multiply(big_matrix &C, big_matrix &A, const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; - joint_matrix - sub_b; + joint_matrix sub_a; + joint_matrix sub_b; joint_matrix sub_c; joint_matrix_load( @@ -75,31 +92,57 @@ void matrix_multiply(big_matrix &C, big_matrix &A, }).wait(); } -int main() { +template void test() { static constexpr size_t MATRIX_M = TM * 2; static constexpr size_t MATRIX_N = TN * 2; static constexpr size_t MATRIX_K = TK * 
2; - bfloat16 A[MATRIX_K][MATRIX_M]; - bfloat16 B[MATRIX_N][MATRIX_K]; + T A[MATRIX_K][MATRIX_M]; + T B[MATRIX_N][MATRIX_K]; float C[MATRIX_M][MATRIX_N]; float D[MATRIX_M][MATRIX_N]; - matrix_fill(MATRIX_K, MATRIX_M, (bfloat16 *)A, + matrix_fill(MATRIX_K, MATRIX_M, (T *)A, [](int i, int j) { return 1.0f * (i + j); }); - matrix_fill(MATRIX_N, MATRIX_K, (bfloat16 *)B, + matrix_fill(MATRIX_N, MATRIX_K, (T *)B, [](int i, int j) { return 2.0f * i + 3.0f * j; }); matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f); matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f); big_matrix MC((float *)&C); big_matrix MD((float *)&D); - big_matrix MA((bfloat16 *)&A); - big_matrix MB((bfloat16 *)&B); + big_matrix MA((T *)&A); + big_matrix MB((T *)&B); matrix_multiply(MC, MA, MB); - matrix_multiply_ref((bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, - MATRIX_N, MATRIX_K, false, true, true); + matrix_multiply_ref((T *)A, (T *)B, (float *)D, MATRIX_M, MATRIX_N, MATRIX_K, + false, true, true); + + assert(matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D)); + std::cout << "passed" << std::endl; +} + +int main() { + queue q; + std::vector combinations = + q.get_device().get_info(); + bool bf16_run = false; + bool half_run = false; + + for (auto &combination : combinations) { + if (!bf16_run && combination.atype == matrix_type::bf16) { + std::cout << "bf16 "; + test(); + bf16_run = true; + } + + if (!half_run && combination.atype == matrix_type::fp16) { + std::cout << "half "; + test(); + half_run = true; + } + + if (bf16_run && half_run) + break; + } - bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D); - std::cout << (res ? 
"passed" : "failed") << std::endl; - return !res; + return 0; } diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp deleted file mode 100644 index 2519b0fdb4c79..0000000000000 --- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_colmajorA_colmajorB.cpp +++ /dev/null @@ -1,20 +0,0 @@ -//==-- joint_matrix_bfloat16_colmajorA_colmajorB.cpp - DPC++ joint_matrix--==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: aspect-ext_intel_matrix - -// RUN: %{build} -o %t.out -// RUN: %{run} %t.out - -// This tests support of col major layout for matrix B which does transpose and -// then VNNI transform. This is currently only available on AMX - -// XFAIL: gpu -// XFAIL-TRACKER: GSD-5768 - -#include "common.hpp" -#include "joint_matrix_bfloat16_colmajorA_colmajorB_impl.hpp" From 80acd9b1f111c8ef7400a25dc52be58d7f33db15 Mon Sep 17 00:00:00 2001 From: przemektmalon Date: Mon, 27 Jan 2025 09:09:31 +0000 Subject: [PATCH 15/45] [SYCLCompat] Fix `compare_mask` implementations and test (#16768) The `compare_mask` and `unordered_compare_mask` implementations were placing the results of the comparison operations in the wrong 2-byte segments of the 4-byte output. The `math_compare.cpp` test has also been fixed, where the "expected" results were previously incorrect, they now reflect the values returned by the corresponding CUDA math functions. 
--- sycl/include/syclcompat/math.hpp | 8 ++++---- sycl/test-e2e/syclcompat/math/math_compare.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp index 148817770468c..eb82c98f445c1 100644 --- a/sycl/include/syclcompat/math.hpp +++ b/sycl/include/syclcompat/math.hpp @@ -598,8 +598,8 @@ template inline std::enable_if_t compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op) { // Since compare returns 0 or 1, -compare will be 0x00000000 or 0xFFFFFFFF - return ((-compare(a[0], b[0], binary_op)) << 16) | - ((-compare(a[1], b[1], binary_op)) & 0xFFFF); + return ((-compare(a[0], b[0], binary_op)) & 0xFFFF) | + ((-compare(a[1], b[1], binary_op)) << 16u); } /// Performs 2 elements unordered comparison, compare result of each element is @@ -613,8 +613,8 @@ template inline std::enable_if_t unordered_compare_mask(const ValueT a, const ValueT b, const BinaryOperation binary_op) { - return ((-unordered_compare(a[0], b[0], binary_op)) << 16) | - ((-unordered_compare(a[1], b[1], binary_op)) & 0xFFFF); + return ((-unordered_compare(a[0], b[0], binary_op)) & 0xFFFF) | + ((-unordered_compare(a[1], b[1], binary_op)) << 16); } /// Compute vectorized max for two values, with each value treated as a vector diff --git a/sycl/test-e2e/syclcompat/math/math_compare.cpp b/sycl/test-e2e/syclcompat/math/math_compare.cpp index 0f77160a564e7..72e175d0361e2 100644 --- a/sycl/test-e2e/syclcompat/math/math_compare.cpp +++ b/sycl/test-e2e/syclcompat/math/math_compare.cpp @@ -301,12 +301,12 @@ typename ValueT> void test_compare_mask() { // 1.0 == 1.0, 2.0 == 3.0 -> 0xffff0000 BinaryOpTestLauncher(grid, threads) .template launch_test>(op1, op3, - 0xffff0000); + 0x0000ffff); // 1.0 == 3.0, 2.0 == 2.0 -> 0x0000ffff BinaryOpTestLauncher(grid, threads) .template launch_test>(op1, op4, - 0x0000ffff); + 0xffff0000); // 1.0 == NaN, 2.0 == NaN -> 0x00000000 BinaryOpTestLauncher(grid, threads) @@ 
-350,12 +350,12 @@ typename ValueT> void test_unordered_compare_mask() { // 1.0 == 1.0, 2.0 == 3.0 -> 0xffff0000 BinaryOpTestLauncher(grid, threads) .template launch_test>( - op1, op3, 0xffff0000); + op1, op3, 0x0000ffff); // 1.0 == 3.0, 2.0 == 2.0 -> 0x0000ffff BinaryOpTestLauncher(grid, threads) .template launch_test>( - op1, op4, 0x0000ffff); + op1, op4, 0xffff0000); // 1.0 == NaN, 2.0 == NaN -> 0xffffffff BinaryOpTestLauncher(grid, threads) From c7b3197c32c88fea4d3c8b596949f9e5a23c4c77 Mon Sep 17 00:00:00 2001 From: przemektmalon Date: Mon, 27 Jan 2025 14:09:16 +0000 Subject: [PATCH 16/45] [SYCL][Bindless][E2E] Test for host USM backed images (#16607) This patch adds a test for an image backed by a host USM allocation. Related UR PR: https://github.com/oneapi-src/unified-runtime/pull/2551 --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 +- .../bindless_images/sampling_2D_USM_host.cpp | 147 ++++++++++++++++++ 2 files changed, 153 insertions(+), 6 deletions(-) create mode 100644 sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 5ba8a6aa938f1..7b46bd5b034c9 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit b841691699393dd2375e987c3d38d5f59c3e35cf -# Merge: c6859445 9de10cd9 +# commit 0bb6789f0113ea937d861fd67fd677b91ecdeb8b +# Merge: e370a2b9 eeff9f4a # Author: Kenneth Benzie (Benie) -# Date: Thu Jan 23 16:07:06 2025 +0000 -# Merge pull request #2559 from Bensuo/fix_kernel_arg_indices -# [CUDA][HIP] Fix kernel arguments being overwritten when added out of order -set(UNIFIED_RUNTIME_TAG b841691699393dd2375e987c3d38d5f59c3e35cf) +# Date: Mon Jan 27 10:40:02 2025 +0000 +# Merge pull request #2551 from przemektmalon/przemek/bindless-images-host-usm +# Enable creation of bindless images backed by host USM +set(UNIFIED_RUNTIME_TAG 0bb6789f0113ea937d861fd67fd677b91ecdeb8b) diff 
--git a/sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp b/sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp new file mode 100644 index 0000000000000..bca3d2c1c0ddd --- /dev/null +++ b/sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp @@ -0,0 +1,147 @@ +// REQUIRES: cuda +// REQUIRES: aspect-ext_oneapi_bindless_images_2d_usm + +// RUN: %{build} -o %t.out +// RUN: %{run-unfiltered-devices} %t.out + +#include +#include +#include + +#include +#include + +// Uncomment to print additional test information +// #define VERBOSE_PRINT + +class image_addition; + +int main() { + + sycl::device dev; + sycl::queue q(dev); + auto ctxt = q.get_context(); + + // declare image data + size_t width = 5; + size_t height = 6; + size_t N = width * height; + size_t widthInBytes = width * sizeof(float); + std::vector out(N); + std::vector expected(N); + std::vector dataIn(N); + + for (int i = 0; i < width; i++) { + for (int j = 0; j < height; j++) { + expected[i + (width * j)] = i + (width * j); + dataIn[i + (width * j)] = i + (width * j); + } + } + + try { + sycl::ext::oneapi::experimental::bindless_image_sampler samp( + sycl::addressing_mode::clamp, + sycl::coordinate_normalization_mode::normalized, + sycl::filtering_mode::linear); + + // Extension: image descriptor + sycl::ext::oneapi::experimental::image_descriptor desc( + {width, height}, 1, sycl::image_channel_type::fp32); + + auto devicePitchAlign = dev.get_info< + sycl::ext::oneapi::experimental::info::device::image_row_pitch_align>(); + auto deviceMaxPitch = + dev.get_info(); + + // Pitch requirements: + // - pitch % devicePitchAlign == 0 + // - pitch >= widthInBytes + // - pitch <= deviceMaxPitch + size_t pitch = devicePitchAlign * + std::ceil(float(widthInBytes) / float(devicePitchAlign)); + assert(pitch <= deviceMaxPitch); + + // Host USM allocation + auto imgMem = + sycl::aligned_alloc_host(devicePitchAlign, (pitch * height), ctxt); + + if (imgMem == nullptr) { + std::cerr << "Error allocating images!" 
<< std::endl; + return 1; + } + + // Copy to host USM and incorporate pitch + for (size_t i = 0; i < height; i++) { + memcpy(static_cast(imgMem) + (i * pitch / sizeof(float)), + dataIn.data() + (i * width), widthInBytes); + } + + // Extension: create the image and return the handle + sycl::ext::oneapi::experimental::sampled_image_handle imgHandle = + sycl::ext::oneapi::experimental::create_image(imgMem, pitch, samp, desc, + dev, ctxt); + + sycl::buffer buf((float *)out.data(), + sycl::range<2>{height, width}); + q.submit([&](sycl::handler &cgh) { + auto outAcc = buf.get_access( + cgh, sycl::range<2>{height, width}); + + cgh.parallel_for( + sycl::nd_range<2>{{width, height}, {width, height}}, + [=](sycl::nd_item<2> it) { + size_t dim0 = it.get_local_id(0); + size_t dim1 = it.get_local_id(1); + + // Normalize coordinates -- +0.5 to look towards centre of pixel + float fdim0 = float(dim0 + 0.5f) / (float)width; + float fdim1 = float(dim1 + 0.5f) / (float)height; + + // Extension: sample image data from handle + float px = sycl::ext::oneapi::experimental::sample_image( + imgHandle, sycl::float2(fdim0, fdim1)); + + outAcc[sycl::id<2>{dim1, dim0}] = px; + }); + }); + + q.wait_and_throw(); + + // Extension: cleanup + sycl::ext::oneapi::experimental::destroy_image_handle(imgHandle, dev, ctxt); + sycl::free(imgMem, ctxt); + } catch (sycl::exception e) { + std::cerr << "SYCL exception caught! : " << e.what() << "\n"; + return 1; + } catch (...) { + std::cerr << "Unknown exception caught!\n"; + return 2; + } + + // collect and validate output + bool validated = true; + for (int i = 0; i < N; i++) { + bool mismatch = false; + if (out[i] != expected[i]) { + mismatch = true; + validated = false; + } + + if (mismatch) { +#ifdef VERBOSE_PRINT + std::cout << "Result mismatch! Expected: " << expected[i] + << ", Actual: " << out[i] << std::endl; +#else + break; +#endif + } + } + if (validated) { + std::cout << "Test passed!" 
<< std::endl; + return 0; + } + + std::cout << "Test failed!" << std::endl; + return 3; +} From deb3c1c0afd86f3a9608eb691584658f4e559212 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Mon, 27 Jan 2025 06:59:27 -0800 Subject: [PATCH 17/45] [CI] Refactor build/run-only logic in `run-tests/e2e/action.yml` (#16780) Reduce number of input parameters and make the logic a bit cleaner (IMO). This PR also uses that updated logic to make building E2E tests optional in `sycl-linux-build.yml` and makes enabled in pre-commit only for now, effectively fixing the regression in Nightly CI introduced in https://github.com/intel/llvm/pull/16682. --- .github/workflows/sycl-linux-build.yml | 12 ++++++---- .github/workflows/sycl-linux-precommit.yml | 1 + .github/workflows/sycl-linux-run-tests.yml | 18 +++++---------- devops/actions/run-tests/e2e/action.yml | 26 ++++++++++------------ 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 64ec651b44643..412e081eb9cb1 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -44,6 +44,9 @@ on: description: 'Artifacts retention period' type: string default: 3 + e2e_binaries_artifact: + type: string + required: False outputs: build_conclusion: @@ -247,12 +250,13 @@ jobs: retention-days: ${{ inputs.retention-days }} - name: Copy toolchain - if: ${{ always() && !cancelled() && steps.build.conclusion == 'success' }} + if: ${{ inputs.e2e_binaries_artifact && always() && !cancelled() && steps.build.conclusion == 'success' }} # We must have the compiler in the same location as it will be in the E2E # run-tests job. 
run: cp -r $GITHUB_WORKSPACE/build/install $GITHUB_WORKSPACE/toolchain - name: Source OneAPI TBB vars.sh + if: ${{ inputs.e2e_binaries_artifact && always() && !cancelled() && steps.build.conclusion == 'success' }} shell: bash run: | # https://github.com/actions/runner/issues/1964 prevents us from using @@ -270,11 +274,11 @@ jobs: rm env_before env_after - name: Build E2E tests - if: ${{ always() && !cancelled() && steps.build.conclusion == 'success' }} + if: ${{ inputs.e2e_binaries_artifact && always() && !cancelled() && steps.build.conclusion == 'success' }} uses: ./devops/actions/run-tests/e2e with: ref: ${{ inputs.ref || github.sha }} - e2e_testing_mode: build-only + testing_mode: build-only target_devices: all - artifact_suffix: default + binaries_artifact: ${{ inputs.e2e_binaries_artifact }} cxx_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++ diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index a4f36c4aeae23..025944b1d2f12 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -49,6 +49,7 @@ jobs: cc: clang cxx: clang++ changes: ${{ needs.detect_changes.outputs.filters }} + e2e_binaries_artifact: sycl_e2e_bin_default determine_arc_tests: name: Decide which Arc tests to run diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index ca9645c947e25..2ede8a2eb2299 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -54,23 +54,18 @@ on: e2e_binaries_artifact: description: | - By setting this the E2E binaries folder will not be created, rather it - will be downloaded and extracted from the specified artifact. When - running tests in `run-only` mode this must be provided. + Must be set if `e2e_testing_mode` is equal to `run-only` and the + artifact must exist. Can be set in other modes resulting in artifact + upload. 
type: string default: '' required: False e2e_testing_mode: description: | Testing mode to run E2E tests in, can be either `full`, `build-only` - or `run-only`. In `build-only` mode an artifact of the E2E binaries - will be uploaded. + or `run-only`. type: string default: 'full' - artifact_suffix: - description: 'Suffix for E2E binaries artifact that is output when in `build-only`.' - type: string - default: 'default' retention-days: description: 'E2E/SYCL-CTS binaries artifact retention period.' type: string @@ -289,12 +284,11 @@ jobs: uses: ./devops/actions/run-tests/e2e with: ref: ${{ inputs.ref || github.sha }} - e2e_binaries_artifact: ${{ inputs.e2e_binaries_artifact }} + binaries_artifact: ${{ inputs.e2e_binaries_artifact }} + testing_mode: ${{ inputs.e2e_testing_mode }} extra_cmake_args: ${{ inputs.extra_cmake_args }} - e2e_testing_mode: ${{ inputs.e2e_testing_mode }} target_devices: ${{ inputs.target_devices }} extra_lit_opts: ${{ inputs.extra_lit_opts }} - artifact_suffix: ${{ inputs.artifact_suffix }} retention-days: ${{ inputs.retention-days }} - name: Run SYCL CTS Tests diff --git a/devops/actions/run-tests/e2e/action.yml b/devops/actions/run-tests/e2e/action.yml index 18ac10c490dbd..08c1e40fc2e18 100644 --- a/devops/actions/run-tests/e2e/action.yml +++ b/devops/actions/run-tests/e2e/action.yml @@ -3,18 +3,16 @@ name: 'Run SYCL E2E tests' inputs: ref: required: false - e2e_binaries_artifact: + binaries_artifact: required: false + testing_mode: + required: true extra_cmake_args: required: false - e2e_testing_mode: - required: true target_devices: required: true extra_lit_opts: required: false - artifact_suffix: - required: false retention-days: required: false cxx_compiler: @@ -32,19 +30,19 @@ runs: cache_path: "/__w/repo_cache/" - name: Download E2E Binaries - if: inputs.e2e_binaries_artifact != '' + if: inputs.testing_mode == 'run-only' uses: actions/download-artifact@v4 with: - name: ${{ inputs.e2e_binaries_artifact }} + name: ${{ 
inputs.binaries_artifact }} - name: Extract E2E Binaries - if: inputs.e2e_binaries_artifact != '' + if: inputs.testing_mode == 'run-only' shell: bash run: | mkdir build-e2e tar -I 'zstd' -xf e2e_binaries.tar.zst -C build-e2e - name: Deduce E2E CMake options - if: inputs.e2e_binaries_artifact == '' + if: inputs.testing_mode != 'run-only' id: cmake_opts shell: bash env: @@ -54,14 +52,14 @@ runs: echo "opts=$CMAKE_EXTRA_ARGS" >> $GITHUB_OUTPUT fi - name: Configure E2E tests - if: inputs.e2e_binaries_artifact == '' + if: inputs.testing_mode != 'run-only' shell: bash run: | cmake -GNinja -B./build-e2e -S./llvm/sycl/test-e2e -DCMAKE_CXX_COMPILER="${{ inputs.cxx_compiler || '$(which clang++)'}}" -DLLVM_LIT="$PWD/llvm/llvm/utils/lit/lit.py" ${{ steps.cmake_opts.outputs.opts }} - name: SYCL End-to-end tests shell: bash {0} env: - LIT_OPTS: -v --no-progress-bar --show-unsupported --show-pass --show-xfail --max-time 3600 --time-tests --param test-mode=${{ inputs.e2e_testing_mode }} --param sycl_devices=${{ inputs.target_devices }} ${{ inputs.extra_lit_opts }} + LIT_OPTS: -v --no-progress-bar --show-unsupported --show-pass --show-xfail --max-time 3600 --time-tests --param test-mode=${{ inputs.testing_mode }} --param sycl_devices=${{ inputs.target_devices }} ${{ inputs.extra_lit_opts }} run: | ninja -C build-e2e check-sycl-e2e > e2e.log 2>&1 exit_code=$? @@ -72,14 +70,14 @@ runs: exit $exit_code - name: Pack E2E binaries - if: ${{ always() && !cancelled() && inputs.e2e_testing_mode == 'build-only'}} + if: ${{ always() && !cancelled() && inputs.binaries_artifact != '' && inputs.testing_mode != 'run-only'}} shell: bash run: | tar -I 'zstd -9' -cf e2e_binaries.tar.zst -C ./build-e2e . 
- name: Upload E2E binaries - if: ${{ always() && !cancelled() && inputs.e2e_testing_mode == 'build-only'}} + if: ${{ always() && !cancelled() && inputs.binaries_artifact != '' && inputs.testing_mode != 'run-only'}} uses: actions/upload-artifact@v4 with: - name: sycl_e2e_bin_${{ inputs.artifact_suffix }} + name: ${{ inputs.binaries_artifact }} path: e2e_binaries.tar.zst retention-days: ${{ inputs.retention-days }} From 4f829bdbf54bfbbb4450735f69e57fdba950fdd8 Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Mon, 27 Jan 2025 16:34:30 +0100 Subject: [PATCH 18/45] [SYCL] Optimize back-to-back ControlBarrier calls (#16750) This pass removes redundant __spirv_ControlBarrier call (as well as ITT annotations surrounding it) in case if it's neighboring another __spirv_ControlBarrier call with the same memory scope and memory semantics arguments. If the calls have different execution scope arguments - then pick the one with the 'bigger' scope. --------- Signed-off-by: Sidorov, Dmitry --- .../SYCLOptimizeBackToBackBarrier.h | 29 ++++ llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 1 + llvm/lib/SYCLLowerIR/CMakeLists.txt | 1 + .../SYCLOptimizeBackToBackBarrier.cpp | 160 ++++++++++++++++++ .../remove-back-to-back-barrier.ll | 99 +++++++++++ 6 files changed, 291 insertions(+) create mode 100644 llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h create mode 100644 llvm/lib/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.cpp create mode 100644 llvm/test/SYCLLowerIR/SYCLOptimizeBackToBackBarrier/remove-back-to-back-barrier.ll diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h b/llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h new file mode 100644 index 0000000000000..7ea93f928d4c2 --- /dev/null +++ b/llvm/include/llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h @@ -0,0 +1,29 @@ +//==- SYCLOptimizeBackToBackBarrier.h - SYCLOptimizeBackToBackBarrier Pass -==// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass cleans up back-to-back ControlBarrier calls. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_SYCL_OPTIMIZE_BACK_TO_BACK_BARRIER_H +#define LLVM_SYCL_OPTIMIZE_BACK_TO_BACK_BARRIER_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class SYCLOptimizeBackToBackBarrierPass + : public PassInfoMixin { +public: + PreservedAnalyses run(Module &M, ModuleAnalysisManager &); + + static bool isRequired() { return true; } +}; + +} // namespace llvm + +#endif // LLVM_SYCL_OPTIMIZE_BACK_TO_BACK_BARRIER_H diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 922fb8524b23b..c1d8b266946dd 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -163,6 +163,7 @@ #include "llvm/SYCLLowerIR/SYCLConditionalCallOnDevice.h" #include "llvm/SYCLLowerIR/SYCLCreateNVVMAnnotations.h" #include "llvm/SYCLLowerIR/SYCLJointMatrixTransform.h" +#include "llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h" #include "llvm/SYCLLowerIR/SYCLPropagateAspectsUsage.h" #include "llvm/SYCLLowerIR/SYCLPropagateJointMatrixUsage.h" #include "llvm/SYCLLowerIR/SYCLVirtualFunctionsAnalysis.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 7a0637da38261..ce2365b6ff45b 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -173,6 +173,7 @@ MODULE_PASS("esimd-remove-host-code", ESIMDRemoveHostCodePass()); MODULE_PASS("esimd-remove-optnone-noinline", ESIMDRemoveOptnoneNoinlinePass()); MODULE_PASS("sycl-conditional-call-on-device", SYCLConditionalCallOnDevicePass()) MODULE_PASS("sycl-joint-matrix-transform", SYCLJointMatrixTransformPass()) 
+MODULE_PASS("sycl-optimize-back-to-back-barrier", SYCLOptimizeBackToBackBarrierPass()) MODULE_PASS("sycl-propagate-aspects-usage", SYCLPropagateAspectsUsagePass()) MODULE_PASS("sycl-propagate-joint-matrix-usage", SYCLPropagateJointMatrixUsagePass()) MODULE_PASS("sycl-add-opt-level-attribute", SYCLAddOptLevelAttributePass()) diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt index c9bd07eb7e78e..66294ccdb0439 100644 --- a/llvm/lib/SYCLLowerIR/CMakeLists.txt +++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt @@ -65,6 +65,7 @@ add_llvm_component_library(LLVMSYCLLowerIR SYCLDeviceRequirements.cpp SYCLKernelParamOptInfo.cpp SYCLJointMatrixTransform.cpp + SYCLOptimizeBackToBackBarrier.cpp SYCLPropagateAspectsUsage.cpp SYCLPropagateJointMatrixUsage.cpp SYCLVirtualFunctionsAnalysis.cpp diff --git a/llvm/lib/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.cpp b/llvm/lib/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.cpp new file mode 100644 index 0000000000000..e7973dd48212f --- /dev/null +++ b/llvm/lib/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.cpp @@ -0,0 +1,160 @@ +//=== SYCLOptimizeBackToBackBarrier.cpp - SYCL barrier optimization pass ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass cleans up back-to-back ControlBarrier calls. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/SYCLLowerIR/SYCLOptimizeBackToBackBarrier.h" + +#include "llvm/IR/IRBuilder.h" + +using namespace llvm; + +namespace { + +static constexpr char CONTROL_BARRIER[] = "_Z22__spirv_ControlBarrieriii"; +static constexpr char ITT_BARRIER[] = "__itt_offload_wg_barrier_wrapper"; +static constexpr char ITT_RESUME[] = "__itt_offload_wi_resume_wrapper"; + +// Known scopes in SPIR-V. +enum class Scope { + CrossDevice = 0, + Device = 1, + Workgroup = 2, + Subgroup = 3, + Invocation = 4 +}; + +enum class CompareRes { BIGGER = 0, SMALLER = 1, EQUAL = 2, UNKNOWN = 3 }; + +// This map is added in case of any future scopes are added to SPIR-V and/or +// SYCL. +const std::unordered_map ScopeWeights = { + {static_cast(Scope::CrossDevice), 1000}, + {static_cast(Scope::Device), 800}, + {static_cast(Scope::Workgroup), 600}, + {static_cast(Scope::Subgroup), 400}, + {static_cast(Scope::Invocation), 10}}; + +inline CompareRes compareScopesWithWeights(const uint64_t LHS, + const uint64_t RHS) { + auto LHSIt = ScopeWeights.find(LHS); + auto RHSIt = ScopeWeights.find(RHS); + + if (LHSIt == ScopeWeights.end() || RHSIt == ScopeWeights.end()) + return CompareRes::UNKNOWN; + + const uint64_t LHSWeight = LHSIt->second; + const uint64_t RHSWeight = RHSIt->second; + + if (LHSWeight > RHSWeight) + return CompareRes::BIGGER; + if (LHSWeight < RHSWeight) + return CompareRes::SMALLER; + return CompareRes::EQUAL; +} + +// The function removes back-to-back ControlBarrier calls in case if they +// have the same memory scope and memory semantics arguments. When two +// back-to-back ControlBarriers are having different execution scope arguments - +// pick the one with the 'bigger' scope. +// It also cleans up ITT annotations surrounding the removed barrier call. 
+bool processControlBarrier(Function *F) { + BasicBlock *PrevBB = nullptr; + llvm::SmallPtrSet ToErase; + for (auto I = F->user_begin(), E = F->user_end(); I != E;) { + User *U = *I++; + auto *CI = dyn_cast(U); + if (!CI) + continue; + + // New basic block - new processing. + BasicBlock *CurrentBB = CI->getParent(); + if (CurrentBB != PrevBB) { + PrevBB = CurrentBB; + continue; + } + + llvm::SmallPtrSet ToEraseLocalITT; + BasicBlock::iterator It(CI); + // Iterate over the basic block storing back-to-back barriers and their ITT + // annotations into ToErase container. + while (It != CurrentBB->begin()) { + --It; + auto *Cand = dyn_cast(&*It); + if (!Cand) + break; + CallInst *CIToRemove = Cand; + StringRef CandName = Cand->getCalledFunction()->getName(); + if (CandName == ITT_RESUME || CandName == ITT_BARRIER) { + ToEraseLocalITT.insert(Cand); + continue; + } else if (CandName == CONTROL_BARRIER) { + bool EqualOps = true; + const auto *ExecutionScopeCI = CI->getOperand(0); + const auto *ExecutionScopeCand = Cand->getOperand(0); + if (ExecutionScopeCI != ExecutionScopeCand) { + if (isa(ExecutionScopeCI) && + isa(ExecutionScopeCand)) { + const auto ConstScopeCI = + cast(ExecutionScopeCI)->getZExtValue(); + const auto ConstScopeCand = + cast(ExecutionScopeCand)->getZExtValue(); + // Pick ControlBarrier with the 'bigger' execution scope. + const auto Compare = + compareScopesWithWeights(ConstScopeCI, ConstScopeCand); + if (Compare == CompareRes::SMALLER) + CIToRemove = CI; + else if (Compare == CompareRes::UNKNOWN) + // Unknown scopes = unknown rules. Keep ControlBarrier call. + EqualOps = false; + } else + EqualOps = false; + } + // TODO: may be handle a case with not-matching memory scope and + // memory semantic arguments in a smart way. 
+ for (unsigned I = 1; I != CI->getNumOperands(); ++I) { + if (CI->getOperand(I) != Cand->getOperand(I)) { + EqualOps = false; + break; + } + } + if (EqualOps) { + ToErase.insert(CIToRemove); + for (auto *ITT : ToEraseLocalITT) + ToErase.insert(ITT); + ToEraseLocalITT.clear(); + } + } + } + } + + if (ToErase.empty()) + return false; + + for (auto *I : ToErase) { + I->dropAllReferences(); + I->eraseFromParent(); + } + + return true; +} + +} // namespace + +PreservedAnalyses +SYCLOptimizeBackToBackBarrierPass::run(Module &M, ModuleAnalysisManager &MAM) { + bool ModuleChanged = false; + for (Function &F : M) + if (F.isDeclaration()) + if (F.getName() == CONTROL_BARRIER) + ModuleChanged |= processControlBarrier(&F); + + return ModuleChanged ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/llvm/test/SYCLLowerIR/SYCLOptimizeBackToBackBarrier/remove-back-to-back-barrier.ll b/llvm/test/SYCLLowerIR/SYCLOptimizeBackToBackBarrier/remove-back-to-back-barrier.ll new file mode 100644 index 0000000000000..00edaefb9cc6c --- /dev/null +++ b/llvm/test/SYCLLowerIR/SYCLOptimizeBackToBackBarrier/remove-back-to-back-barrier.ll @@ -0,0 +1,99 @@ +; RUN: opt -passes=sycl-optimize-back-to-back-barrier -S < %s | FileCheck %s +; The test checks if back-to-back __spirv_ControlBarrier and ITT annotations are +; removed. 
+ +; CHECK-LABEL: define spir_func void @_Z3fooii(i32 %[[#Scope1:]], i32 %[[#Scope2:]]) +; CHECK: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 1, i32 noundef 912) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 2, i32 noundef 912) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 64, i32 noundef 2, i32 noundef 912) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 %[[#Scope1]], i32 noundef 2, i32 noundef 912) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 %[[#Scope2]], i32 noundef 2, i32 noundef 912) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK-NEXT: ret void + +; CHECK-LABEL: define dso_local void @_Z3booi +; CHECK: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() +; CHECK: call spir_func void @__itt_offload_wg_barrier_wrapper() +; CHECK-NEXT: call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) +; CHECK-NEXT: call spir_func void @__itt_offload_wi_resume_wrapper() + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = 
"spirv64-unknown-unknown" + +define spir_func void @_Z3fooii(i32 %0, i32 %1) { + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 4, i32 noundef 1, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 2, i32 noundef 1, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 1, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 1, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 2, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 2, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 64, i32 noundef 2, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 %0, i32 noundef 2, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 %0, i32 noundef 2, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void 
@_Z22__spirv_ControlBarrieriii(i32 %1, i32 noundef 2, i32 noundef 912) + call spir_func void @__itt_offload_wi_resume_wrapper() + + ret void +} + +define dso_local void @_Z3booi(i32 noundef %0) local_unnamed_addr #0 { + %2 = icmp eq i32 %0, 0 + br i1 %2, label %3, label %4 + +3: ; preds = %1 + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) + call spir_func void @__itt_offload_wi_resume_wrapper() + br label %4 + +4: ; preds = %3, %1 + call spir_func void @__itt_offload_wg_barrier_wrapper() + call void @_Z22__spirv_ControlBarrieriii(i32 noundef 3, i32 noundef 3, i32 noundef 0) + call spir_func void @__itt_offload_wi_resume_wrapper() + ret void +} + +declare spir_func void @_Z22__spirv_ControlBarrieriii(i32 noundef, i32 noundef, i32 noundef) + +declare spir_func void @__itt_offload_wg_barrier_wrapper() + +declare spir_func void @__itt_offload_wi_resume_wrapper() From 367f35573e2d93651256486783b8f2c8c547cd8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9sz=C3=A1ros=20Gergely?= Date: Mon, 27 Jan 2025 17:28:39 +0100 Subject: [PATCH 19/45] [SYCL][E2E][NFC] Fix NameError if directive fails to parse (#16767) Currently if a test contains a malformed directive, the following exception is raised along with the original parsing error: ```plaintext Exception during script execution: (original error) During handling of the above exception, another exception occurred: Traceback (most recent call last): File "llvm/llvm/utils/lit/lit/worker.py", line 76, in _execute_test_handle_errors result = test.config.test_format.execute(test, lit_config) File "llvm/sycl/test-e2e/format.py", line 232, in execute script = self.parseTestScript(test) File "llvm/sycl/test-e2e/format.py", line 105, in parseTestScript return lit.Test.Result(Test.UNRESOLVED, str(e)) NameError: name 'Test' is not defined ``` The test ends up as UNRESOLVED either way, but fixing it is easy and improves the error message 
greatly. --- sycl/test-e2e/format.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index a9e98a4ed8037..812fec75ab732 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -61,7 +61,7 @@ def parseTestScript(self, test): require_script=True, ) except ValueError as e: - return lit.Test.Result(Test.UNRESOLVED, str(e)) + return lit.Test.Result(lit.Test.UNRESOLVED, str(e)) script = parsed["RUN:"] or [] assert parsed["DEFINE:"] == script assert parsed["REDEFINE:"] == script From 566f5149ade9575c3ba8b8f634173972ffb04b13 Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Mon, 27 Jan 2025 09:27:27 -0800 Subject: [PATCH 20/45] [CI] Install `pkg-config` in docker container (#16797) `pkg-config` is required for `llvm-spirv` to detect `spirv-tools` installation. See https://github.com/intel/llvm/pull/16743#issuecomment-2614079031 for more info. --- devops/scripts/install_build_tools.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/devops/scripts/install_build_tools.sh b/devops/scripts/install_build_tools.sh index 55cbc46aad449..a878f2807fd0e 100755 --- a/devops/scripts/install_build_tools.sh +++ b/devops/scripts/install_build_tools.sh @@ -30,7 +30,8 @@ apt update && apt install -yqq \ # To obtain latest release of spriv-tool. # Same as what's done in SPRIV-LLVM-TRANSLATOR: # https://github.com/KhronosGroup/SPIRV-LLVM-Translator/blob/cec12d6cf46306d0a015e883d5adb5a8200df1c0/.github/workflows/check-out-of-tree-build.yml#L59 +# pkg-config is required for llvm-spriv to detect spriv-tools installation. . 
/etc/os-release curl -L "https://packages.lunarg.com/lunarg-signing-key-pub.asc" | apt-key add - echo "deb https://packages.lunarg.com/vulkan $VERSION_CODENAME main" | tee -a /etc/apt/sources.list -apt update && apt install -yqq spirv-tools +apt update && apt install -yqq spirv-tools pkg-config From 4eb095e92984358ed4e52639ae15d83f549aebde Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Mon, 27 Jan 2025 09:34:35 -0800 Subject: [PATCH 21/45] [SYCL] remove experimental online_compiler extension (#16776) Minus support for "CM", all the functionality of the old experimental online_compiler extension is being provided by the new (also experimental) kernel_compiler. Only better. Besides always being merely experimental, the online_compiler has been marked as deprecated for over a year, it has become burdensome to continue to support, and it sometimes confuses users. The decision has been made to remove it, without waiting for an ABI breaking window. --- .../sycl_ext_intel_online_compiler.asciidoc | 208 ------------- .../sycl_ext_intel_online_compiler.asciidoc | 2 - .../intel/experimental/online_compiler.hpp | 274 ------------------ .../sycl/ext/intel/online_compiler.hpp | 20 -- sycl/source/CMakeLists.txt | 1 - .../kernel_compiler_opencl.cpp | 2 +- .../ocloc_api.h | 0 .../online_compiler/online_compiler.cpp | 267 ----------------- .../OnlineCompiler/online_compiler_L0.cpp | 79 ----- .../OnlineCompiler/online_compiler_OpenCL.cpp | 113 -------- .../OnlineCompiler/online_compiler_common.hpp | 193 ------------ sycl/test/abi/sycl_abi_neutrality_test.cpp | 4 - sycl/test/abi/sycl_symbols_linux.dump | 4 - sycl/test/abi/sycl_symbols_windows.dump | 30 +- .../test/basic_tests/no_math_in_global_ns.cpp | 1 - sycl/test/warnings/deprecated_headers.cpp | 5 +- sycl/test/warnings/sycl_2020_deprecations.cpp | 6 - 17 files changed, 15 insertions(+), 1194 deletions(-) delete mode 100644 sycl/doc/extensions/deprecated/sycl_ext_intel_online_compiler.asciidoc delete mode 100644 
sycl/doc/extensions/experimental/sycl_ext_intel_online_compiler.asciidoc delete mode 100644 sycl/include/sycl/ext/intel/experimental/online_compiler.hpp delete mode 100644 sycl/include/sycl/ext/intel/online_compiler.hpp rename sycl/source/detail/{online_compiler => kernel_compiler}/ocloc_api.h (100%) delete mode 100644 sycl/source/detail/online_compiler/online_compiler.cpp delete mode 100644 sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp delete mode 100644 sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp delete mode 100644 sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp diff --git a/sycl/doc/extensions/deprecated/sycl_ext_intel_online_compiler.asciidoc b/sycl/doc/extensions/deprecated/sycl_ext_intel_online_compiler.asciidoc deleted file mode 100644 index 1b6fec92b033d..0000000000000 --- a/sycl/doc/extensions/deprecated/sycl_ext_intel_online_compiler.asciidoc +++ /dev/null @@ -1,208 +0,0 @@ -= SYCL Intel extension: Online Compilation -Konstantin Bobrovskii , John Pennycook -v0.1 -:source-highlighter: pygments -:icons: font -:dpcpp: pass:[DPC++] - -== Introduction -This document describes an interface for online compilation from high-level languages, such as -OpenCL, to a binary format, such as SPIR-V, loadable by SYCL backends. Unlike SYCL 2020 provisional's -OpenCL backend online compilation interface, this interface is not bound to any particular backend and does -not require available SYCL context for online compilation. - -This gives a flexibility to "cross-compile" to SPIR-V or other supported formats without any SYCL -device or context available. The online compilation API uses the `online_compiler` class to access -compilation services. Instances of the class are constructed based on a specification of the desired -compilation target passed to the constructors - such as compiled code format, target architecture, -etc. All the settings are optional, and by default the target is generic SPIR-V. - -This API is an Intel SYCL extension. 
- -== Status - -This extension has been deprecated. Although it is still supported in {dpcpp}, -we expect that the interfaces defined in this specification will be removed in -an upcoming {dpcpp} release. *Shipping software products should stop using -APIs defined in this specification and use an alternative instead.* - -== Online compilation API - -All online compilation API elements reside in the `sycl::INTEL` namespace. - -=== Source language specification - -Elements of the enum designate the source language: -[source,c++] ------------------ -enum class source_language { - opencl_c, // OpenCL C language - cm // Intel's C-for-Media language -}; ------------------ - -=== APIs to express compilation target characteristics - -The desired format of the compiled code: -[source,c++] ------------------ -enum class compiled_code_format { - spir_v -}; ------------------ - -Target device architecture: -[source,c++] ------------------ -class device_arch { -public: - static constexpr int any = 0; // designates an unspecified architecture - device_arch(int Val); - - // GPU architecture IDs - enum gpu { gpu_any = 1, ... }; - // CPU architecture IDs - enum cpu { cpu_any = 1, ... }; - // FPGA architecture IDs - enum fpga { fpga_any = 1, ... }; - - // Converts this architecture representation to an integer. - operator int(); -}; ------------------ - -=== Compiler API - -To compile a source, a user program must first construct an instance of the `sycl::ext::intel::online_compiler` class. Then pass the source as a `std::string` object to online compiler's `compile` function along with other relevant parameters. The `online_compiler` is templated by the source language, and the `compile` function is a variadic template function. Instantiations of the `online_compiler::compile` for different languages may have different sets of formal parameters. 
The `compile` function returns a binary blob - a `std::vector` - with the device code compiled according to the compilation target specification provided at online compiler construction time. - -==== Online compiler -[source,c++] ------------------ -template class online_compiler; ------------------ - -==== Compilation target specification elements. -[cols="40,60",options="header"] -|=== -|Element name and type |Description - -|`compiled_code_format` OutputFormat -|Compiled code format. - -|`std::pair` OutputFormatVersion -|Compiled code format version - a pair of "major" and "minor" components. - -|`sycl::info::device_type` DeviceType -|Target device type. - -|`device_arch` DeviceArch -|Target device architecture. - -|`bool` Is64Bit -|Whether the target device architecture is 64-bit. - -|`std::string` DeviceStepping -|Target device stepping (implementation defined). -|=== - -Online compiler construction or source compilation may be unsuccessful, in which case an instance -of `sycl::ext::intel::online_compile_error` is thrown. For example, when some of the compilation -target specification elements are not supported by the implementation, or there is a syntax error -in the source program. - - -==== `sycl::ext::intel::online_compiler` constructors. -[cols="40,60",options="header"] -|=== -|Constructor |Description - -|`online_compiler(compiled_code_format fmt = compiled_code_format::spir_v)` -| Constructs online compiler which can target any device and produces - given compiled code format. Produced device code is 64-bit. OutputFormatVersion is - implementation defined. The created compiler is "optimistic" - it assumes all applicable SYCL - device capabilities are supported by the target device(s). - -|`online_compiler( - sycl::info::device_type dev_type, - device_arch arch, - compiled_code_format fmt = compiled_code_format::spir_v)` -| Constructor version which allows to specify target device type and architecture. 
- -|`online_compiler(const sycl::device &dev)` -|Constructs online compiler for the target specified by given SYCL device. -|=== - -==== The compilation function - `online_compiler::compile` -It compiles given in-memory source to a binary blob. Blob format, -other parameters are set in the constructor. Specialization for each language will provide exact -signatures, which can be different for different languages.Throws `online_compile_error` if -compilation is not successful. -[source,c++] ------------------ -template - std::vector compile(const std::string &src, const Tys&... args); ------------------ - -Instantiations of the compilation function: -[source,c++] ------------------ -/// Compiles given OpenCL source. May throw \c online_compile_error. -/// @param src - contents of the source -/// @param options - compilation options (implementation defined); standard -/// OpenCL JIT compiler options must be supported -template <> -template <> -std::vector online_compiler::compile( - const std::string &src, const std::vector &options); - -/// Compiles given CM source. -template <> -template <> -std::vector online_compiler::compile( - const std::string &src); - -/// Compiles given CM source. -/// @param options - compilation options (implementation defined) -template <> -template <> -std::vector online_compiler::compile( - const std::string &src, const std::vector &options); ------------------ - -== API usage example -This example compiles an OpenCL source to a generic SPIR-V. 
-[source,c++] ------------------ -#include "sycl/ext/intel/online_compiler.hpp" - -#include -#include - -static const char *kernelSource = R"===( -__kernel void my_kernel(__global int *in, __global int *out) { - size_t i = get_global_id(0); - out[i] = in[i] + 1; -} -)==="; - -using namespace sycl::INTEL; - -int main(int argc, char **argv) { - online_compiler compiler; - std::vector blob; - - try { - blob = compiler.compile( - std::string(kernelSource), - std::vector { - std::string("-cl-fast-relaxed-math") - } - ); - } - catch (online_compile_error &e) { - std::cout << "compilation failed\n"; - return 1; - } - return 0; -} ------------------ diff --git a/sycl/doc/extensions/experimental/sycl_ext_intel_online_compiler.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_intel_online_compiler.asciidoc deleted file mode 100644 index 525ad49f90ad1..0000000000000 --- a/sycl/doc/extensions/experimental/sycl_ext_intel_online_compiler.asciidoc +++ /dev/null @@ -1,2 +0,0 @@ -This extension has been deprecated, but the specification is still available -link:../deprecated/sycl_ext_intel_online_compiler.asciidoc[here]. \ No newline at end of file diff --git a/sycl/include/sycl/ext/intel/experimental/online_compiler.hpp b/sycl/include/sycl/ext/intel/experimental/online_compiler.hpp deleted file mode 100644 index 365265e762c86..0000000000000 --- a/sycl/include/sycl/ext/intel/experimental/online_compiler.hpp +++ /dev/null @@ -1,274 +0,0 @@ -//===------- online_compiler.hpp - Online source compilation service ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include -#include // for __SYCL_EXPORT -#include - -#include -#include - -namespace sycl { -inline namespace _V1 { -namespace ext::intel::experimental { -namespace detail { -using namespace sycl::detail; -} - -using byte = unsigned char; - -enum class compiled_code_format { - spir_v = 0 // the only format supported for now -}; - -class device_arch { -public: - static constexpr int any = 0; - - device_arch(int Val) : Val(Val) {} - - // TODO1: the list must be extended with a bunch of new GPUs available. - // TODO2: the list of supported GPUs grows rapidly. - // The API must allow user to define the target GPU option even if it is - // not listed in this enumerator below. - enum gpu { - gpu_any = 1, - gpu_gen9 = 2, - gpu_skl = gpu_gen9, - gpu_gen9_5 = 3, - gpu_kbl = gpu_gen9_5, - gpu_cfl = gpu_gen9_5, - gpu_gen11 = 4, - gpu_icl = gpu_gen11, - gpu_gen12 = 5, - gpu_tgl = gpu_gen12, - gpu_tgllp = gpu_gen12 - }; - - enum cpu { - cpu_any = 1, - }; - - enum fpga { - fpga_any = 1, - }; - - operator int() { return Val; } - -private: - int Val; -}; - -/// Represents an error happend during online compilation. -class online_compile_error : public sycl::exception { -public: - online_compile_error() = default; - online_compile_error(const std::string &Msg) - : sycl::exception(make_error_code(errc::invalid), Msg) {} -}; - -/// Designates a source language for the online compiler. -enum class source_language { opencl_c = 0, cm = 1 }; - -/// Represents an online compiler for the language given as template -/// parameter. -template -class __SYCL2020_DEPRECATED( - "experimental online_compiler is being deprecated. 
See " - "'sycl_ext_oneapi_kernel_compiler.asciidoc' instead for new kernel " - "compiler extension to kernel_bundle implementation.") online_compiler { -#if __INTEL_PREVIEW_BREAKING_CHANGES - // Refactor this during next ABI Breaking window. We have an `std::string` - // data member so cannot be accessing `this` when crossing ABI boundary. -#endif - __SYCL_EXPORT static std::vector - compile_impl(detail::string_view Src, - const std::vector &Options, - std::pair OutputFormatVersion, - sycl::info::device_type DeviceType, device_arch DeviceArch, - bool Is64Bit, detail::string_view DeviceStepping, - void *&CompileToSPIRVHandle, void *&FreeSPIRVOutputsHandle); - - std::vector compile_impl(const std::string &Source, - const std::vector &UserArgs) { - std::vector Args; - for (auto &&Arg : UserArgs) - Args.emplace_back(Arg); - - return compile_impl(std::string_view{Source}, Args, OutputFormatVersion, - DeviceType, DeviceArch, Is64Bit, - std::string_view{DeviceStepping}, CompileToSPIRVHandle, - FreeSPIRVOutputsHandle); - } - -public: - /// Constructs online compiler which can target any device and produces - /// given compiled code format. Produces 64-bit device code. - /// The created compiler is "optimistic" - it assumes all applicable SYCL - /// device capabilities are supported by the target device(s). - online_compiler(compiled_code_format fmt = compiled_code_format::spir_v) - : OutputFormat(fmt), OutputFormatVersion({0, 0}), - DeviceType(sycl::info::device_type::all), DeviceArch(device_arch::any), - Is64Bit(true), DeviceStepping("") {} - - /// Constructs online compiler which targets given architecture and produces - /// given compiled code format. Produces 64-bit device code. - /// Throws online_compile_error if values of constructor arguments are - /// contradictory or not supported - e.g. if the source language is not - /// supported for given device type. 
- online_compiler(sycl::info::device_type dev_type, device_arch arch, - compiled_code_format fmt = compiled_code_format::spir_v) - : OutputFormat(fmt), OutputFormatVersion({0, 0}), DeviceType(dev_type), - DeviceArch(arch), Is64Bit(true), DeviceStepping("") {} - - /// Constructs online compiler for the target specified by given SYCL device. - // TODO: the initial version generates the generic code (SKL now), need - // to do additional device::info calls to determine the device by it's - // features. - online_compiler(const sycl::device &) - : OutputFormat(compiled_code_format::spir_v), OutputFormatVersion({0, 0}), - DeviceType(sycl::info::device_type::all), DeviceArch(device_arch::any), - Is64Bit(true), DeviceStepping("") {} - - /// Compiles given in-memory \c Lang source to a binary blob. Blob format, - /// other parameters are set in the constructor by the compilation target - /// specification parameters. - /// Specialization for each language will provide exact signatures, which - /// can be different for different languages. - /// Throws online_compile_error if compilation is not successful. - template - std::vector compile(const std::string &src, const Tys &...args); - - /// Sets the compiled code format of the compilation target and returns *this. - online_compiler &setOutputFormat(compiled_code_format fmt) { - OutputFormat = fmt; - return *this; - } - - /// Sets the compiled code format version of the compilation target and - /// returns *this. - online_compiler &setOutputFormatVersion(int major, int minor) { - OutputFormatVersion = {major, minor}; - return *this; - } - - /// Sets the device type of the compilation target and returns *this. - online_compiler &setTargetDeviceType(sycl::info::device_type type) { - DeviceType = type; - return *this; - } - - /// Sets the device architecture of the compilation target and returns *this. 
- online_compiler &setTargetDeviceArch(device_arch arch) { - DeviceArch = arch; - return *this; - } - - /// Makes the compilation target 32-bit and returns *this. - online_compiler &set32bitTarget() { - Is64Bit = false; - return *this; - }; - - /// Makes the compilation target 64-bit and returns *this. - online_compiler &set64bitTarget() { - Is64Bit = true; - return *this; - }; - - /// Sets implementation-defined target device stepping of the compilation - /// target and returns *this. - online_compiler &setTargetDeviceStepping(const std::string &id) { - DeviceStepping = id; - return *this; - } - -private: - /// Compiled code format. - compiled_code_format OutputFormat; - - /// Compiled code format version - a pair of "major" and "minor" components - std::pair OutputFormatVersion; - - /// Target device type - sycl::info::device_type DeviceType; - - /// Target device architecture - device_arch DeviceArch; - - /// Whether the target device architecture is 64-bit - bool Is64Bit; - - /// Target device stepping (implementation defined) - std::string DeviceStepping; - - /// Handles to helper functions used by the implementation. - void *CompileToSPIRVHandle = nullptr; - void *FreeSPIRVOutputsHandle = nullptr; -}; - -// Specializations of the online_compiler class and 'compile' function for -// particular languages and parameter types. - -/// Compiles the given OpenCL source. May throw \c online_compile_error. -/// @param src - contents of the source. -/// @param options - compilation options (implementation defined); standard -/// OpenCL JIT compiler options must be supported. -template <> -template <> -#if !defined(__SYCL_ONLINE_COMPILER_CPP) || \ - defined(__INTEL_PREVIEW_BREAKING_CHANGES) -inline -#else -__SYCL_EXPORT -#endif - std::vector - online_compiler::compile( - const std::string &src, const std::vector &options) { - return compile_impl(src, options); -} - -/// Compiles the given OpenCL source. May throw \c online_compile_error. 
-/// @param src - contents of the source. -template <> -template <> -std::vector -online_compiler::compile(const std::string &src) { - return compile(src, std::vector{}); -} - -/// Compiles the given CM source \p src. -/// @param src - contents of the source. -/// @param options - compilation options (implementation defined). -template <> -template <> -#if !defined(__SYCL_ONLINE_COMPILER_CPP) || \ - defined(__INTEL_PREVIEW_BREAKING_CHANGES) -inline -#else -__SYCL_EXPORT -#endif - std::vector - online_compiler::compile( - const std::string &src, const std::vector &options) { - return compile_impl(src, options); -} - -/// Compiles the given CM source \p src. -template <> -template <> -std::vector -online_compiler::compile(const std::string &src) { - return compile(src, std::vector{}); -} - -} // namespace ext::intel::experimental -} // namespace _V1 -} // namespace sycl diff --git a/sycl/include/sycl/ext/intel/online_compiler.hpp b/sycl/include/sycl/ext/intel/online_compiler.hpp deleted file mode 100644 index 7cc4a35cde6b2..0000000000000 --- a/sycl/include/sycl/ext/intel/online_compiler.hpp +++ /dev/null @@ -1,20 +0,0 @@ -//===------- online_compiler.hpp - Online source compilation service ------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#pragma once - -#include - -#if !defined(_MSC_VER) || defined(__clang__) -// MSVC doesn't support #warning and we cannot use other methods to report a -// warning from inside a system header (which SYCL is considered to be). 
-#warning sycl/ext/intel/online_compiler.hpp usage is deprecated, \ -include sycl/ext/intel/experimental/online_compiler.hpp instead -#endif - -#include diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index d9f8801d45ba5..b60e70662f058 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -282,7 +282,6 @@ set(SYCL_COMMON_SOURCES "detail/platform_impl.cpp" "detail/program_manager/program_manager.cpp" "detail/queue_impl.cpp" - "detail/online_compiler/online_compiler.cpp" "detail/os_util.cpp" "detail/persistent_device_code_cache.cpp" "detail/platform_util.cpp" diff --git a/sycl/source/detail/kernel_compiler/kernel_compiler_opencl.cpp b/sycl/source/detail/kernel_compiler/kernel_compiler_opencl.cpp index 5452bf40795dd..63907ff913dca 100644 --- a/sycl/source/detail/kernel_compiler/kernel_compiler_opencl.cpp +++ b/sycl/source/detail/kernel_compiler/kernel_compiler_opencl.cpp @@ -11,8 +11,8 @@ #include "kernel_compiler_opencl.hpp" -#include "../online_compiler/ocloc_api.h" #include "../split_string.hpp" +#include "ocloc_api.h" #include // strlen #include // for std::function diff --git a/sycl/source/detail/online_compiler/ocloc_api.h b/sycl/source/detail/kernel_compiler/ocloc_api.h similarity index 100% rename from sycl/source/detail/online_compiler/ocloc_api.h rename to sycl/source/detail/kernel_compiler/ocloc_api.h diff --git a/sycl/source/detail/online_compiler/online_compiler.cpp b/sycl/source/detail/online_compiler/online_compiler.cpp deleted file mode 100644 index 138fc880edb92..0000000000000 --- a/sycl/source/detail/online_compiler/online_compiler.cpp +++ /dev/null @@ -1,267 +0,0 @@ -//==----------- online_compiler.cpp ----------------------------------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define __SYCL_ONLINE_COMPILER_CPP - -#include -#include -#include - -#include - -#include "ocloc_api.h" - -namespace sycl { -inline namespace _V1 { -namespace ext::intel::experimental { -namespace detail { - -using namespace sycl::detail; - -static std::vector -prepareOclocArgs(sycl::info::device_type DeviceType, device_arch DeviceArch, - bool Is64Bit, string_view DeviceStepping, - const std::string &UserArgs) { - std::vector Args = {"ocloc", "-q", "-spv_only", "-device"}; - - if (DeviceType == sycl::info::device_type::gpu) { - switch (DeviceArch) { - case device_arch::gpu_gen9: - Args.push_back("skl"); - break; - - case device_arch::gpu_gen9_5: - Args.push_back("cfl"); - break; - - case device_arch::gpu_gen11: - Args.push_back("icllp"); - break; - - case device_arch::gpu_gen12: - Args.push_back("tgllp"); - break; - - default: - Args.push_back("tgllp"); - } - } else { - // TODO: change that to generic device when ocloc adds support for it. - // For now "tgllp" is used as the option supported on all known GPU RT. - Args.push_back("tgllp"); - } - - if (DeviceStepping != "") { - Args.push_back("-revision_id"); - Args.push_back(DeviceStepping.data()); - } - - Args.push_back(Is64Bit ? "-64" : "-32"); - - if (UserArgs != "") { - Args.push_back("-options"); - Args.push_back(UserArgs.c_str()); - } - - return Args; -} - -/// Compiles the given source \p Source to SPIR-V IL and returns IL as a vector -/// of bytes. -/// @param Source - Either OpenCL or CM source code. -/// @param DeviceType - SYCL device type, e.g. cpu, gpu, accelerator, etc. -/// @param DeviceArch - More detailed info on the target device architecture. -/// @param Is64Bit - If set to true, specifies the 64-bit architecture. -/// Otherwise, 32-bit is assumed. -/// @param DeviceStepping - implementation specific target device stepping. 
-/// @param CompileToSPIRVHandle - Output parameter. It is set to the address -/// of the library function doing the compilation. -/// @param FreeSPIRVOutputsHandle - Output parameter. It is set to the address -/// of the library function freeing memory -/// allocated during the compilation. -/// @param UserArgs - User's options to ocloc compiler. -static std::vector -compileToSPIRV(string_view Src, sycl::info::device_type DeviceType, - device_arch DeviceArch, bool Is64Bit, string_view DeviceStepping, - void *&CompileToSPIRVHandle, void *&FreeSPIRVOutputsHandle, - const std::vector &UserArgs) { - std::string Source{Src.data()}; - - if (!CompileToSPIRVHandle) { -#ifdef __SYCL_RT_OS_WINDOWS - static const std::string OclocLibraryName = "ocloc64.dll"; -#else - static const std::string OclocLibraryName = "libocloc.so"; -#endif - auto CustomDeleter = [](void *StoredPtr) { - if (!StoredPtr) - return; - std::ignore = sycl::detail::ur::unloadOsLibrary(StoredPtr); - }; - std::unique_ptr OclocLibrary( - sycl::detail::ur::loadOsLibrary(OclocLibraryName), CustomDeleter); - if (!OclocLibrary) - throw online_compile_error("Cannot load ocloc library: " + - OclocLibraryName); - void *OclocVersionHandle = sycl::detail::ur::getOsLibraryFuncAddress( - OclocLibrary.get(), "oclocVersion"); - // The initial versions of ocloc library did not have the oclocVersion() - // function. Those versions had the same API as the first version of ocloc - // library having that oclocVersion() function. - int LoadedVersion = ocloc_version_t::OCLOC_VERSION_1_0; - if (OclocVersionHandle) { - decltype(::oclocVersion) *OclocVersionFunc = - reinterpret_cast(OclocVersionHandle); - LoadedVersion = OclocVersionFunc(); - } - // The loaded library with version (A.B) is compatible with expected API/ABI - // version (X.Y) used here if A == B and B >= Y. 
- int LoadedVersionMajor = LoadedVersion >> 16; - int LoadedVersionMinor = LoadedVersion & 0xffff; - int CurrentVersionMajor = ocloc_version_t::OCLOC_VERSION_CURRENT >> 16; - int CurrentVersionMinor = ocloc_version_t::OCLOC_VERSION_CURRENT & 0xffff; - if (LoadedVersionMajor != CurrentVersionMajor || - LoadedVersionMinor < CurrentVersionMinor) - throw online_compile_error( - std::string("Found incompatible version of ocloc library: (") + - std::to_string(LoadedVersionMajor) + "." + - std::to_string(LoadedVersionMinor) + - "). The supported versions are (" + - std::to_string(CurrentVersionMajor) + - ".N), where (N >= " + std::to_string(CurrentVersionMinor) + ")."); - - CompileToSPIRVHandle = sycl::detail::ur::getOsLibraryFuncAddress( - OclocLibrary.get(), "oclocInvoke"); - if (!CompileToSPIRVHandle) - throw online_compile_error("Cannot load oclocInvoke() function"); - FreeSPIRVOutputsHandle = sycl::detail::ur::getOsLibraryFuncAddress( - OclocLibrary.get(), "oclocFreeOutput"); - if (!FreeSPIRVOutputsHandle) { - CompileToSPIRVHandle = NULL; - throw online_compile_error("Cannot load oclocFreeOutput() function"); - } - OclocLibrary.release(); - } - - std::string CombinedUserArgs; - for (const auto &UserArg : UserArgs) { - if (UserArg == "") - continue; - if (CombinedUserArgs != "") - CombinedUserArgs = CombinedUserArgs + " " + UserArg; - else - CombinedUserArgs = UserArg; - } - std::vector Args = detail::prepareOclocArgs( - DeviceType, DeviceArch, Is64Bit, DeviceStepping, CombinedUserArgs); - - uint32_t NumOutputs = 0; - byte **Outputs = nullptr; - uint64_t *OutputLengths = nullptr; - char **OutputNames = nullptr; - - const byte *Sources[] = {reinterpret_cast(Source.c_str())}; - const char *SourceName = "main.cl"; - const uint64_t SourceLengths[] = {Source.length() + 1}; - - Args.push_back("-file"); - Args.push_back(SourceName); - - decltype(::oclocInvoke) *OclocInvokeFunc = - reinterpret_cast(CompileToSPIRVHandle); - int CompileError = - OclocInvokeFunc(Args.size(), 
Args.data(), 1, Sources, SourceLengths, - &SourceName, 0, nullptr, nullptr, nullptr, &NumOutputs, - &Outputs, &OutputLengths, &OutputNames); - - std::vector SpirV; - std::string CompileLog; - for (uint32_t I = 0; I < NumOutputs; I++) { - size_t NameLen = strlen(OutputNames[I]); - if (NameLen >= 4 && strstr(OutputNames[I], ".spv") != nullptr && - Outputs[I] != nullptr) { - assert(SpirV.size() == 0 && "More than one SPIR-V output found."); - SpirV = std::vector(Outputs[I], Outputs[I] + OutputLengths[I]); - } else if (!strcmp(OutputNames[I], "stdout.log")) { - CompileLog = std::string(reinterpret_cast(Outputs[I])); - } - } - - // Try to free memory before reporting possible error. - decltype(::oclocFreeOutput) *OclocFreeOutputFunc = - reinterpret_cast(FreeSPIRVOutputsHandle); - int MemFreeError = - OclocFreeOutputFunc(&NumOutputs, &Outputs, &OutputLengths, &OutputNames); - - if (CompileError) - throw online_compile_error("ocloc reported compilation errors: {\n" + - CompileLog + "\n}"); - if (SpirV.empty()) - throw online_compile_error( - "Unexpected output: ocloc did not return SPIR-V"); - if (MemFreeError) - throw online_compile_error("ocloc cannot safely free resources"); - - return SpirV; -} -} // namespace detail - -template -__SYCL_EXPORT std::vector online_compiler::compile_impl( - detail::string_view Src, const std::vector &Options, - std::pair OutputFormatVersion, sycl::info::device_type DeviceType, - device_arch DeviceArch, bool Is64Bit, detail::string_view DeviceStepping, - void *&CompileToSPIRVHandle, void *&FreeSPIRVOutputsHandle) { - - if (OutputFormatVersion != std::pair{0, 0}) { - std::string Version = std::to_string(OutputFormatVersion.first) + ", " + - std::to_string(OutputFormatVersion.second); - throw online_compile_error(std::string("The output format version (") + - Version + ") is not supported yet"); - } - - std::vector UserArgs; - for (auto &&Opt : Options) - UserArgs.emplace_back(Opt.data()); - - if constexpr (Lang == source_language::cm) - 
UserArgs.push_back("-cmc"); - - return detail::compileToSPIRV(Src, DeviceType, DeviceArch, Is64Bit, - DeviceStepping, CompileToSPIRVHandle, - FreeSPIRVOutputsHandle, UserArgs); -} - -template __SYCL_EXPORT std::vector -online_compiler::compile_impl( - detail::string_view Src, const std::vector &Options, - std::pair OutputFormatVersion, sycl::info::device_type DeviceType, - device_arch DeviceArch, bool Is64Bit, detail::string_view DeviceStepping, - void *&CompileToSPIRVHandle, void *&FreeSPIRVOutputsHandle); - -template __SYCL_EXPORT std::vector -online_compiler::compile_impl( - detail::string_view Src, const std::vector &Options, - std::pair OutputFormatVersion, sycl::info::device_type DeviceType, - device_arch DeviceArch, bool Is64Bit, detail::string_view DeviceStepping, - void *&CompileToSPIRVHandle, void *&FreeSPIRVOutputsHandle); -} // namespace ext::intel::experimental - -namespace ext { -namespace __SYCL2020_DEPRECATED( - "use 'ext::intel::experimental' instead") intel { -using namespace ext::intel::experimental; -} -} // namespace ext - -namespace __SYCL2020_DEPRECATED( - "use 'ext::intel::experimental' instead") INTEL { -using namespace ext::intel::experimental; -} -} // namespace _V1 -} // namespace sycl diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp deleted file mode 100644 index 4de91a66941aa..0000000000000 --- a/sycl/test-e2e/OnlineCompiler/online_compiler_L0.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// REQUIRES: level_zero, level_zero_dev_kit, cm-compiler -// XFAIL: gpu -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16406 -// RUN: %{build} -Wno-error=deprecated-declarations -DRUN_KERNELS %level_zero_options -o %t.out -// RUN: %{run} %t.out - -// This test checks ext::intel feature class online_compiler for Level-Zero. -// All Level-Zero specific code is kept here and the common part that can be -// re-used by other backends is kept in online_compiler_common.hpp file. 
- -#include -#include - -#include - -// clang-format off -#include -#include -// clang-format on - -using byte = unsigned char; - -#ifdef RUN_KERNELS -bool testSupported(sycl::queue &Queue) { - return Queue.get_backend() == sycl::backend::ext_oneapi_level_zero; -} - -sycl::kernel getSYCLKernelWithIL(sycl::queue &Queue, - const std::vector &IL) { - - ze_module_desc_t ZeModuleDesc = {}; - ZeModuleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; - ZeModuleDesc.inputSize = IL.size(); - ZeModuleDesc.pInputModule = IL.data(); - ZeModuleDesc.pBuildFlags = ""; - ZeModuleDesc.pConstants = nullptr; - - sycl::context Context = Queue.get_context(); - sycl::device Device = Queue.get_device(); - auto ZeDevice = - sycl::get_native(Device); - auto ZeContext = - sycl::get_native(Context); - - ze_module_build_log_handle_t ZeBuildLog; - ze_module_handle_t ZeModule; - ze_result_t ZeResult = zeModuleCreate(ZeContext, ZeDevice, &ZeModuleDesc, - &ZeModule, &ZeBuildLog); - assert(ZeResult == ZE_RESULT_SUCCESS); - - ze_kernel_handle_t ZeKernel = nullptr; - - ze_kernel_desc_t ZeKernelDesc{ZE_STRUCTURE_TYPE_KERNEL_DESC, nullptr, 0, - "my_kernel"}; - ZeResult = zeKernelCreate(ZeModule, &ZeKernelDesc, &ZeKernel); - assert(ZeResult == ZE_RESULT_SUCCESS); - sycl::kernel_bundle SyclKB = - sycl::make_kernel_bundle( - {ZeModule, sycl::ext::oneapi::level_zero::ownership::keep}, Context); - - auto Kernel = sycl::make_kernel( - {SyclKB, ZeKernel, sycl::ext::oneapi::level_zero::ownership::keep}, - Context); - - // Should not throw an exception - try { - auto num_args = Kernel.get_info(); - (void)num_args; - } catch (sycl::exception &e) { - assert(false && "Using \"info::kernel::num_args\" query for valid kernel " - "should not throw an exception."); - } - - return Kernel; -} -#endif // RUN_KERNELS - -#include "online_compiler_common.hpp" diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp b/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp deleted file mode 100644 index 
b0023426f0631..0000000000000 --- a/sycl/test-e2e/OnlineCompiler/online_compiler_OpenCL.cpp +++ /dev/null @@ -1,113 +0,0 @@ -// REQUIRES: opencl, opencl_icd, cm-compiler -// XFAIL: gpu || cpu || accelerator -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16406 -// RUN: %{build} -Wno-error=deprecated-declarations -DRUN_KERNELS %opencl_lib -o %t.out -// RUN: %{run} %t.out - -// This test checks ext::intel feature class online_compiler for OpenCL. -// All OpenCL specific code is kept here and the common part that can be -// re-used by other backends is kept in online_compiler_common.hpp file. - -#include -#include -#include -#include - -#include - -using byte = unsigned char; - -#ifdef RUN_KERNELS -std::tuple GetOCLVersion(sycl::device Device) { - cl_int Err; - cl_device_id ClDevice = sycl::get_native(Device); - - size_t VersionSize = 0; - Err = clGetDeviceInfo(ClDevice, CL_DEVICE_VERSION, 0, nullptr, &VersionSize); - assert(Err == CL_SUCCESS); - - std::string Version(VersionSize, '\0'); - Err = clGetDeviceInfo(ClDevice, CL_DEVICE_VERSION, VersionSize, - Version.data(), nullptr); - assert(Err == CL_SUCCESS); - - std::string_view Prefix = "OpenCL "; - size_t VersionBegin = Version.find_first_of(" "); - size_t VersionEnd = Version.find_first_of(" ", VersionBegin + 1); - size_t VersionSeparator = Version.find_first_of(".", VersionBegin + 1); - - bool HaveOCLPrefix = - std::equal(Prefix.begin(), Prefix.end(), Version.begin()); - - assert(HaveOCLPrefix && VersionBegin != std::string::npos && - VersionEnd != std::string::npos && - VersionSeparator != std::string::npos); - - std::string VersionMajor{Version.begin() + VersionBegin + 1, - Version.begin() + VersionSeparator}; - std::string VersionMinor{Version.begin() + VersionSeparator + 1, - Version.begin() + VersionEnd}; - - unsigned long OCLMajor = strtoul(VersionMajor.c_str(), nullptr, 10); - unsigned long OCLMinor = strtoul(VersionMinor.c_str(), nullptr, 10); - - assert(OCLMajor > 0 && (OCLMajor > 2 || OCLMinor <= 
2) && - OCLMajor != UINT_MAX && OCLMinor != UINT_MAX); - - return std::make_tuple(OCLMajor, OCLMinor); -} - -bool testSupported(sycl::queue &Queue) { - if (Queue.get_backend() != sycl::backend::opencl) - return false; - - sycl::device Device = Queue.get_device(); - auto [OCLMajor, OCLMinor] = GetOCLVersion(Device); - - // Creating a program from IL is only supported on >=2.1 or if - // cl_khr_il_program is supported on the device. - return (OCLMajor == 2 && OCLMinor >= 1) || OCLMajor > 2 || - Device.has_extension("cl_khr_il_program"); -} - -sycl::kernel getSYCLKernelWithIL(sycl::queue &Queue, - const std::vector &IL) { - sycl::context Context = Queue.get_context(); - - cl_int Err = 0; - cl_program ClProgram = 0; - - sycl::device Device = Queue.get_device(); - auto [OCLMajor, OCLMinor] = GetOCLVersion(Device); - if ((OCLMajor == 2 && OCLMinor >= 1) || OCLMajor > 2) { - // clCreateProgramWithIL is supported if OCL version >=2.1. - ClProgram = - clCreateProgramWithIL(sycl::get_native(Context), - IL.data(), IL.size(), &Err); - } else { - // Fall back to using extension function for building IR. 
- using ApiFuncT = - cl_program(CL_API_CALL *)(cl_context, const void *, size_t, cl_int *); - ApiFuncT FuncPtr = - reinterpret_cast(clGetExtensionFunctionAddressForPlatform( - sycl::get_native(Context.get_platform()), - "clCreateProgramWithILKHR")); - - assert(FuncPtr != nullptr); - - ClProgram = FuncPtr(sycl::get_native(Context), - IL.data(), IL.size(), &Err); - } - assert(Err == CL_SUCCESS); - - Err = clBuildProgram(ClProgram, 0, nullptr, nullptr, nullptr, nullptr); - assert(Err == CL_SUCCESS); - - cl_kernel ClKernel = clCreateKernel(ClProgram, "my_kernel", &Err); - assert(Err == CL_SUCCESS); - - return sycl::make_kernel(ClKernel, Context); -} -#endif // RUN_KERNELS - -#include "online_compiler_common.hpp" diff --git a/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp b/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp deleted file mode 100644 index b585126f95674..0000000000000 --- a/sycl/test-e2e/OnlineCompiler/online_compiler_common.hpp +++ /dev/null @@ -1,193 +0,0 @@ -#include -#include - -#include -#include - -auto constexpr CLSource = R"===( -__kernel void my_kernel(__global int *in, __global int *out) { - size_t i = get_global_id(0); - out[i] = in[i]*2 + 100; -} -)==="; - -auto constexpr CLSourceSyntaxError = R"===( -__kernel void my_kernel(__global int *in, __global int *out) { - syntax error here - size_t i = get_global_id(0); - out[i] = in[i]*2 + 100; -} -)==="; - -auto constexpr CMSource = R"===( -extern "C" -void cm_kernel() { -} -)==="; - -using namespace sycl::ext::intel; - -#ifdef RUN_KERNELS -void testSyclKernel(sycl::queue &Q, sycl::kernel Kernel) { - std::cout << "Run the kernel now:\n"; - constexpr int N = 4; - int InputArray[N] = {0, 1, 2, 3}; - int OutputArray[N] = {}; - - sycl::buffer InputBuf(InputArray, sycl::range<1>(N)); - sycl::buffer OutputBuf(OutputArray, sycl::range<1>(N)); - - Q.submit([&](sycl::handler &CGH) { - CGH.set_arg(0, InputBuf.get_access(CGH)); - CGH.set_arg(1, OutputBuf.get_access(CGH)); - 
CGH.parallel_for(sycl::range<1>{N}, Kernel); - }); - - auto Out = OutputBuf.get_host_access(); - for (int I = 0; I < N; I++) - std::cout << I << "*2 + 100 = " << Out[I] << "\n"; -} -#endif // RUN_KERNELS - -int main(int argc, char **argv) { - sycl::queue Q; - sycl::device Device = Q.get_device(); - -#ifdef RUN_KERNELS - if (!testSupported(Q)) { - std::cout << "Building for IL is not supported. Skipping!" << std::endl; - return 0; - } -#endif - - { // Compile and run a trivial OpenCL kernel. - std::cout << "Test case1\n"; - sycl::ext::intel::experimental::online_compiler< - sycl::ext::intel::experimental::source_language::opencl_c> - Compiler; - std::vector IL; - try { - IL = Compiler.compile( - CLSource, - // Pass two options to check that more than one is accepted. - std::vector{"-cl-fast-relaxed-math", - "-cl-finite-math-only"}); - std::cout << "IL size = " << IL.size() << "\n"; - assert(IL.size() > 0 && "Unexpected IL size"); - } catch (sycl::exception &e) { - std::cout << "Compilation to IL failed: " << e.what() << "\n"; - return 1; - } -#ifdef RUN_KERNELS - testSyclKernel(Q, getSYCLKernelWithIL(Q, IL)); -#endif // RUN_KERNELS - } - - { // Compile and run a trivial OpenCL kernel using online_compiler() - // constructor accepting SYCL device. - std::cout << "Test case2\n"; - sycl::ext::intel::experimental::online_compiler< - sycl::ext::intel::experimental::source_language::opencl_c> - Compiler(Device); - std::vector IL; - try { - IL = Compiler.compile(CLSource); - std::cout << "IL size = " << IL.size() << "\n"; - assert(IL.size() > 0 && "Unexpected IL size"); - } catch (sycl::exception &e) { - std::cout << "Compilation to IL failed: " << e.what() << "\n"; - return 1; - } -#ifdef RUN_KERNELS - testSyclKernel(Q, getSYCLKernelWithIL(Q, IL)); -#endif // RUN_KERNELS - } - - // TODO: this test is temporarily turned off because CI buildbots do not set - // PATHs to clangFEWrapper library properly. - { // Compile a trivial CM kernel. 
- std::cout << "Test case3\n"; - sycl::ext::intel::experimental::online_compiler< - sycl::ext::intel::experimental::source_language::cm> - Compiler; - try { - std::vector IL = Compiler.compile(CMSource); - - std::cout << "IL size = " << IL.size() << "\n"; - assert(IL.size() > 0 && "Unexpected IL size"); - } catch (sycl::exception &e) { - std::cout << "Compilation to IL failed: " << e.what() << "\n"; - return 1; - } - } - - { // Compile a source with syntax errors. - std::cout << "Test case4\n"; - sycl::ext::intel::experimental::online_compiler< - sycl::ext::intel::experimental::source_language::opencl_c> - Compiler; - std::vector IL; - bool TestPassed = false; - try { - IL = Compiler.compile(CLSourceSyntaxError); - } catch (sycl::exception &e) { - std::string Msg = e.what(); - if (Msg.find("syntax error here") != std::string::npos) - TestPassed = true; - else - std::cerr << "Unexpected exception: " << Msg << "\n"; - } - assert(TestPassed && "Failed to throw an exception for syntax error"); - if (!TestPassed) - return 1; - } - - { // Compile a good CL source using unrecognized compilation options. - std::cout << "Test case5\n"; - sycl::ext::intel::experimental::online_compiler< - sycl::ext::intel::experimental::source_language::opencl_c> - Compiler; - std::vector IL; - bool TestPassed = false; - try { - IL = Compiler.compile(CLSource, - // Intentionally use incorrect option. - std::vector{"WRONG_OPTION"}); - } catch (sycl::exception &e) { - std::string Msg = e.what(); - if (Msg.find("WRONG_OPTION") != std::string::npos) - TestPassed = true; - else - std::cerr << "Unexpected exception: " << Msg << "\n"; - } - assert(TestPassed && - "Failed to throw an exception for unrecognized option"); - if (!TestPassed) - return 1; - } - - { // Try compiling CM source with OpenCL compiler. 
- std::cout << "Test case6\n"; - sycl::ext::intel::experimental::online_compiler< - sycl::ext::intel::experimental::source_language::opencl_c> - Compiler; - std::vector IL; - bool TestPassed = false; - try { - // Intentionally pass CMSource instead of CLSource. - IL = Compiler.compile(CMSource); - } catch (sycl::exception &e) { - std::string Msg = e.what(); - if (Msg.find("error: expected identifier or '('") != std::string::npos) - TestPassed = true; - else - std::cerr << "Unexpected exception: " << Msg << "\n"; - } - assert(TestPassed && "Failed to throw an exception for wrong program"); - if (!TestPassed) - return 1; - } - - std::cout << "\nAll test cases passed.\n"; - return 0; -} diff --git a/sycl/test/abi/sycl_abi_neutrality_test.cpp b/sycl/test/abi/sycl_abi_neutrality_test.cpp index 6920e60031dba..d1a4a8df9bd6c 100644 --- a/sycl/test/abi/sycl_abi_neutrality_test.cpp +++ b/sycl/test/abi/sycl_abi_neutrality_test.cpp @@ -19,10 +19,6 @@ // old entry points. Others were exported unnecessarily but only actually used // inside DSO, yet we have to keep the entry points as well. 
-// https://github.com/intel/llvm/pull/16179 -// CHECK:_ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE0EE7compileIJSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISE_EEEEES8_IhSaIhEERKSE_DpRKT_ -// CHECK:_ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE1EE7compileIJSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISE_EEEEES8_IhSaIhEERKSE_DpRKT_ -// // https://github.com/intel/llvm/pull/16178 // CHECK:_ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv // diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 98dee24572890..224dd494a6258 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2985,10 +2985,6 @@ _ZN4sycl3_V121__isgreaterequal_implEdd _ZN4sycl3_V121__isgreaterequal_implEff _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE -_ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE0EE12compile_implENS0_6detail11string_viewERKSt6vectorIS8_SaIS8_EESt4pairIiiENS0_4info11device_typeENS3_11device_archEbS8_RPvSK_ -_ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE0EE7compileIJSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISE_EEEEES8_IhSaIhEERKSE_DpRKT_ -_ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE1EE12compile_implENS0_6detail11string_viewERKSt6vectorIS8_SaIS8_EESt4pairIiiENS0_4info11device_typeENS3_11device_archEbS8_RPvSK_ -_ZN4sycl3_V13ext5intel12experimental15online_compilerILNS3_15source_languageE1EE7compileIJSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISE_EEEEES8_IhSaIhEERKSE_DpRKT_ _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE 
_ZN4sycl3_V13ext5intel12experimental9pipe_base18get_pipe_name_implEPKv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index edc11b37a071f..da02292f0028b 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -7,8 +7,6 @@ # REQUIRES: windows # UNSUPPORTED: libcxx -??$compile@V?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@@?$online_compiler@$00@experimental@intel@ext@_V1@sycl@@QEAA?AV?$vector@EV?$allocator@E@std@@@std@@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@7@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@7@@Z -??$compile@V?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@std@@@?$online_compiler@$0A@@experimental@intel@ext@_V1@sycl@@QEAA?AV?$vector@EV?$allocator@E@std@@@std@@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@7@AEBV?$vector@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@V?$allocator@V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@2@@7@@Z ??$create_sub_devices@$0BAIG@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@_K@Z ??$create_sub_devices@$0BAIH@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@AEBV?$vector@_KV?$allocator@_K@std@@@4@@Z ??$create_sub_devices@$0BAII@@device@_V1@sycl@@QEBA?AV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@std@@W4partition_affinity_domain@info@12@@Z @@ -282,9 +280,9 @@ ??0SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z 
??0SampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@V?$range@$02@23@PEAXHHV?$id@$02@23@W4image_channel_type@23@W4image_channel_order@23@Uimage_sampler@23@AEBVproperty_list@23@@Z -??0SubmissionInfo@detail@_V1@sycl@@QEAA@XZ -??0SubmissionInfo@detail@_V1@sycl@@QEAA@AEBV0123@@Z ??0SubmissionInfo@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z +??0SubmissionInfo@detail@_V1@sycl@@QEAA@AEBV0123@@Z +??0SubmissionInfo@detail@_V1@sycl@@QEAA@XZ ??0UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAA@AEBV?$shared_ptr@VUnsampledImageAccessorImplHost@detail@_V1@sycl@@@std@@@Z ??0UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@$$QEAV0123@@Z ??0UnsampledImageAccessorBaseHost@detail@_V1@sycl@@QEAA@AEBV0123@@Z @@ -330,18 +328,12 @@ ??0device_image_plain@detail@_V1@sycl@@QEAA@AEBV?$shared_ptr@Vdevice_image_impl@detail@_V1@sycl@@@std@@@Z ??0device_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0device_selector@_V1@sycl@@QEAA@XZ +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z +??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV?$command_graph@$0A@@12345@AEBV?$vector@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V?$allocator@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@@2@@std@@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV0123456@@Z ??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@_KPEBX@Z -?get_active_index@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ -??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z 
-?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEAVdynamic_command_group@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z -??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV?$command_graph@$0A@@12345@AEBV?$vector@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V?$allocator@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@@2@@std@@@Z -??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@$$QEAV012345@@Z -??0dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@AEBV012345@@Z -?set_active_index@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAX_K@Z -??1dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ -??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??0event@_V1@sycl@@AEAA@V?$shared_ptr@Vevent_impl@detail@_V1@sycl@@@std@@@Z ??0event@_V1@sycl@@QEAA@$$QEAV012@@Z ??0event@_V1@sycl@@QEAA@AEBV012@@Z @@ -479,6 +471,7 @@ ??1device@_V1@sycl@@QEAA@XZ ??1device_image_plain@detail@_V1@sycl@@QEAA@XZ ??1device_selector@_V1@sycl@@UEAA@XZ +??1dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1event@_V1@sycl@@QEAA@XZ ??1exception@_V1@sycl@@UEAA@XZ @@ -556,6 +549,8 @@ ??4device_image_plain@detail@_V1@sycl@@QEAAAEAV0123@$$QEAV0123@@Z ??4device_image_plain@detail@_V1@sycl@@QEAAAEAV0123@AEBV0123@@Z ??4device_selector@_V1@sycl@@QEAAAEAV012@AEBV012@@Z +??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@$$QEAV012345@@Z +??4dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV012345@AEBV012345@@Z ??4dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@$$QEAV0123456@@Z ??4dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAAAEAV0123456@AEBV0123456@@Z ??4event@_V1@sycl@@QEAAAEAV012@$$QEAV012@@Z @@ -654,8 
+649,8 @@ ?GDBMethodsAnchor@UnsampledImageAccessorBaseHost@detail@_V1@sycl@@IEAAXXZ ?GetRangeRoundingSettings@handler@_V1@sycl@@AEAAXAEA_K00@Z ?HasAssociatedAccessor@handler@_V1@sycl@@AEBA_NPEAVAccessorImplHost@detail@23@W4target@access@23@@Z -?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEAAAEAV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ +?PostProcessorFunc@SubmissionInfo@detail@_V1@sycl@@QEBAAEBV?$optional@V?$function@$$A6AX_N0AEAVevent@_V1@sycl@@@Z@std@@@234@XZ ?PushBack@exception_list@_V1@sycl@@AEAAX$$QEAVexception_ptr@std@@@Z ?PushBack@exception_list@_V1@sycl@@AEAAXAEBVexception_ptr@std@@@Z ?RangeRoundingTrace@handler@_V1@sycl@@AEAA_NXZ @@ -3707,6 +3702,7 @@ ?addHostAccessorAndWait@detail@_V1@sycl@@YAXPEAVAccessorImplHost@123@@Z ?addHostSampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVSampledImageAccessorImplHost@123@@Z ?addHostUnsampledImageAccessorAndWait@detail@_V1@sycl@@YAXPEAVUnsampledImageAccessorImplHost@123@@Z +?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEAVdynamic_command_group@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z ?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z ?addImpl@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@IEAA?AVnode@34567@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBV?$vector@Vnode@experimental@oneapi@ext@_V1@sycl@@V?$allocator@Vnode@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@@Z ?addLifetimeSharedPtrStorage@handler@_V1@sycl@@AEAAXV?$shared_ptr@$$CBX@std@@@Z @@ -3752,8 +3748,6 @@ 
?checkNodePropertiesAndThrow@modifiable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@KAXAEBVproperty_list@67@@Z ?clearArgs@handler@_V1@sycl@@AEAAXXZ ?code@exception@_V1@sycl@@QEBAAEBVerror_code@std@@XZ -?compile_impl@?$online_compiler@$00@experimental@intel@ext@_V1@sycl@@CA?AV?$vector@EV?$allocator@E@std@@@std@@Vstring_view@detail@56@AEBV?$vector@Vstring_view@detail@_V1@sycl@@V?$allocator@Vstring_view@detail@_V1@sycl@@@std@@@8@U?$pair@HH@8@W4device_type@info@56@Vdevice_arch@23456@_N0AEAPEAX6@Z -?compile_impl@?$online_compiler@$0A@@experimental@intel@ext@_V1@sycl@@CA?AV?$vector@EV?$allocator@E@std@@@std@@Vstring_view@detail@56@AEBV?$vector@Vstring_view@detail@_V1@sycl@@V?$allocator@Vstring_view@detail@_V1@sycl@@@std@@@8@U?$pair@HH@8@W4device_type@info@56@Vdevice_arch@23456@_N0AEAPEAX6@Z ?compile_impl@detail@_V1@sycl@@YA?AV?$shared_ptr@Vkernel_bundle_impl@detail@_V1@sycl@@@std@@AEBV?$kernel_bundle@$0A@@23@AEBV?$vector@Vdevice@_V1@sycl@@V?$allocator@Vdevice@_V1@sycl@@@std@@@5@AEBVproperty_list@23@@Z ?complete_fusion@fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAA?AVevent@56@AEBVproperty_list@56@@Z ?computeFallbackKernelBounds@handler@_V1@sycl@@AEAA?AV?$id@$01@23@_K0@Z @@ -3847,8 +3841,8 @@ ?ext_oneapi_get_graph@queue@_V1@sycl@@QEBA?AV?$command_graph@$0A@@experimental@oneapi@ext@23@XZ ?ext_oneapi_get_kernel@kernel_bundle_plain@detail@_V1@sycl@@AEAA?AVkernel@34@Vstring_view@234@@Z ?ext_oneapi_get_kernel@kernel_bundle_plain@detail@_V1@sycl@@QEAA?AVkernel@34@AEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z -?ext_oneapi_get_last_event_impl@queue@_V1@sycl@@AEBA?AV?$optional@Vevent@_V1@sycl@@@detail@23@XZ ?ext_oneapi_get_last_event@queue@_V1@sycl@@QEBA?AV?$optional@Vevent@_V1@sycl@@@std@@XZ +?ext_oneapi_get_last_event_impl@queue@_V1@sycl@@AEBA?AV?$optional@Vevent@_V1@sycl@@@detail@23@XZ ?ext_oneapi_get_state@queue@_V1@sycl@@QEBA?AW4queue_state@experimental@oneapi@ext@23@XZ 
?ext_oneapi_graph@handler@_V1@sycl@@QEAAXV?$command_graph@$00@experimental@oneapi@ext@23@@Z ?ext_oneapi_graph@queue@_V1@sycl@@QEAA?AVevent@23@V?$command_graph@$00@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z @@ -4014,6 +4008,7 @@ ?getType@handler@_V1@sycl@@AEBA?AW4CGType@detail@23@XZ ?getValueFromDynamicParameter@detail@_V1@sycl@@YAPEAXAEAVdynamic_parameter_base@1experimental@oneapi@ext@23@@Z ?get_access_mode@experimental@oneapi@ext@_V1@sycl@@YA?AW4address_access_mode@12345@PEBX_KAEBVcontext@45@@Z +?get_active_index@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEBA_KXZ ?get_addressing_mode@sampler@_V1@sycl@@QEBA?AW4addressing_mode@23@XZ ?get_allocator_internal@buffer_plain@detail@_V1@sycl@@IEBAAEBV?$unique_ptr@VSYCLMemObjAllocator@detail@_V1@sycl@@U?$default_delete@VSYCLMemObjAllocator@detail@_V1@sycl@@@std@@@std@@XZ ?get_allocator_internal@image_plain@detail@_V1@sycl@@IEBAAEBV?$unique_ptr@VSYCLMemObjAllocator@detail@_V1@sycl@@U?$default_delete@VSYCLMemObjAllocator@detail@_V1@sycl@@@std@@@std@@XZ @@ -4274,6 +4269,7 @@ ?setType@handler@_V1@sycl@@AEAAXW4CGType@detail@23@@Z ?setUserFacingNodeType@handler@_V1@sycl@@AEAAXW4node_type@experimental@oneapi@ext@23@@Z ?set_access_mode@experimental@oneapi@ext@_V1@sycl@@YAXPEBX_KW4address_access_mode@12345@AEBVcontext@45@@Z +?set_active_index@dynamic_command_group@experimental@oneapi@ext@_V1@sycl@@QEAAX_K@Z ?set_arg@handler@_V1@sycl@@QEAAXH$$QEAVraw_kernel_arg@experimental@oneapi@ext@23@@Z ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXAEBV?$function@$$A6AXAEBV?$function@$$A6AXPEAX@Z@std@@@Z@std@@@Z ?set_final_data_internal@buffer_plain@detail@_V1@sycl@@IEAAXXZ diff --git a/sycl/test/basic_tests/no_math_in_global_ns.cpp b/sycl/test/basic_tests/no_math_in_global_ns.cpp index c8f582ea45f59..25408c526c7aa 100644 --- a/sycl/test/basic_tests/no_math_in_global_ns.cpp +++ b/sycl/test/basic_tests/no_math_in_global_ns.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include using namespace 
sycl; diff --git a/sycl/test/warnings/deprecated_headers.cpp b/sycl/test/warnings/deprecated_headers.cpp index 1f6e8bc440877..726ea322d2a92 100644 --- a/sycl/test/warnings/deprecated_headers.cpp +++ b/sycl/test/warnings/deprecated_headers.cpp @@ -5,7 +5,4 @@ #include // expected-warning@sycl/backend/level_zero.hpp:16 {{sycl/backend/level_zero.hpp usage is deprecated, include sycl/ext/oneapi/backend/level_zero.hpp instead}} -#include - -// expected-warning@sycl/ext/intel/online_compiler.hpp:16 {{sycl/ext/intel/online_compiler.hpp usage is deprecated, include sycl/ext/intel/experimental/online_compiler.hpp instead}} -#include +#include \ No newline at end of file diff --git a/sycl/test/warnings/sycl_2020_deprecations.cpp b/sycl/test/warnings/sycl_2020_deprecations.cpp index 6fd61a0a1137e..d3af96fcf1cc5 100644 --- a/sycl/test/warnings/sycl_2020_deprecations.cpp +++ b/sycl/test/warnings/sycl_2020_deprecations.cpp @@ -2,7 +2,6 @@ // expected-warning@CL/sycl.hpp:* {{CL/sycl.hpp is deprecated, use sycl/sycl.hpp}} #include -#include int main() { cl_context ClCtx; @@ -88,11 +87,6 @@ int main() { // expected-warning@+1{{'exception' is deprecated: The version of an exception constructor which takes no arguments is deprecated.}} sycl::exception ex; - // expected-warning@+1{{'online_compiler' is deprecated}} - sycl::ext::intel::experimental::online_compiler< - sycl::ext::intel::experimental::source_language::opencl_c> - oc(Device); - Queue.submit([](sycl::handler &CGH) { // expected-warning@+3{{'nd_range' is deprecated: offsets are deprecated in SYCL2020}} // expected-warning@+2{{'nd_range' is deprecated: offsets are deprecated in SYCL2020}} From e923fbbb2d5307e4d72d3c1dbd699cf9edbafb81 Mon Sep 17 00:00:00 2001 From: Nick Sarnie Date: Tue, 28 Jan 2025 03:01:42 +0900 Subject: [PATCH 22/45] [CI] Update dev-igc even if IGC CI failed (#16790) IGC public CI has been failing for months and we got confirmation even if the public CI fails the change has already been internally 
validated. We can confirm this change works as below: [Old link](https://api.github.com/repos/intel/intel-graphics-compiler/actions/workflows/build-IGC.yml/runs?status=success) [New link](https://api.github.com/repos/intel/intel-graphics-compiler/actions/workflows/build-IGC.yml/runs) Signed-off-by: Sarnie, Nick --- devops/scripts/update_drivers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devops/scripts/update_drivers.py b/devops/scripts/update_drivers.py index e9b14f87d5572..c96d7dffed2d8 100644 --- a/devops/scripts/update_drivers.py +++ b/devops/scripts/update_drivers.py @@ -17,7 +17,7 @@ def get_latest_workflow_runs(repo, workflow_name): + repo + "/actions/workflows/" + workflow_name - + ".yml/runs?status=success" + + ".yml/runs" ).read() return json.loads(action_runs)["workflow_runs"][0] From c7cce7023ae5d6f485417a9cdb897dd756108922 Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Mon, 27 Jan 2025 15:31:07 -0800 Subject: [PATCH 23/45] [SYCL][Test] Add regression test for `sycl::vec` compilation error in windows, debug (#16802) Test to isolate `sycl::vec` regression after https://github.com/intel/llvm/pull/14130. This PR caused `sycl::vec` to use `std::array` as its underlying storage. However, operations on `std::array` may emit debug-mode-only functions, on which the device compiler may fail. --- sycl/test/regression/vec_array_windows.cpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 sycl/test/regression/vec_array_windows.cpp diff --git a/sycl/test/regression/vec_array_windows.cpp b/sycl/test/regression/vec_array_windows.cpp new file mode 100644 index 0000000000000..d9754bc2e4e93 --- /dev/null +++ b/sycl/test/regression/vec_array_windows.cpp @@ -0,0 +1,15 @@ +// Test to isolate sycl::vec regression after +// https://github.com/intel/llvm/pull/14130. This PR caused sycl::vec to use +// std::array as its underlying storage. 
However, operations on std::array +// may emit debug-mode-only functions, on which the device compiler may fail. + +// REQUIRES: windows + +// RUN: %clangxx -fsycl -D_DEBUG %s -fsycl-device-only -Xclang -verify %s -Xclang -verify-ignore-unexpected=note,warning + +#include + +// expected-error@* {{SYCL kernel cannot call a variadic function}} +// expected-error@* {{SYCL kernel cannot call an undefined function without SYCL_EXTERNAL attribute}} +// expected-error@* {{SYCL kernel cannot call an undefined function without SYCL_EXTERNAL attribute}} +SYCL_EXTERNAL auto GetFirstElement(sycl::vec v) { return v[0]; } From 1b5ed8cbdd9933370543bf619d7b74848d1ec3ca Mon Sep 17 00:00:00 2001 From: Julian Oppermann Date: Tue, 28 Jan 2025 20:43:41 +1300 Subject: [PATCH 24/45] [SYCL][RTC] Extend device code split E2E test (#16685) Extends test to cover all explicit device code split modes, and adds a test for implicit device code splitting. --------- Signed-off-by: Julian Oppermann --- .../kernel_compiler_sycl_jit.cpp | 75 +++++++++++++++++-- 1 file changed, 67 insertions(+), 8 deletions(-) diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp index 1e69781307ffc..5aca89e5be8a4 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_sycl_jit.cpp @@ -82,6 +82,18 @@ void vector_add_esimd(float *A, float *B, float *C) { } )==="; +auto constexpr DeviceCodeSplitSource = R"===( +#include + +template SYCL_EXTERNAL +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(sycl::ext::oneapi::experimental::nd_range_kernel<1>) +[[sycl::reqd_work_group_size(WG)]] +void vec_add(T* in1, T* in2, T* out){ + size_t id = sycl::ext::oneapi::this_work_item::get_nd_item<1>().get_global_linear_id(); + out[id] = in1[id] + in2[id]; +} +)==="; + auto constexpr BadSource = R"===( #include @@ -206,12 +218,7 @@ int test_build_and_run() { ctx, syclex::source_language::sycl_jit, SYCLSource, 
syclex::properties{incFiles2}); - exe_kb kbExe3 = syclex::build( - kbSrc2, syclex::properties{ - syclex::build_options{"-fsycl-device-code-split=per_kernel"}, - syclex::registered_kernel_names{"ff_templated"}}); - assert(std::distance(kbExe3.begin(), kbExe3.end()) == 2 && - "Expected 2 device images"); + exe_kb kbExe3 = syclex::build(kbSrc2); sycl::kernel k3 = kbExe3.ext_oneapi_get_kernel("ff_cp"); test_1(q, k3, 37 + 7); @@ -222,6 +229,58 @@ int test_build_and_run() { return 0; } +int test_device_code_split() { + namespace syclex = sycl::ext::oneapi::experimental; + using source_kb = sycl::kernel_bundle; + using exe_kb = sycl::kernel_bundle; + + sycl::queue q; + sycl::context ctx = q.get_context(); + + bool ok = + q.get_device().ext_oneapi_can_compile(syclex::source_language::sycl_jit); + if (!ok) { + std::cout << "Apparently this device does not support `sycl_jit` source " + "kernel bundle extension: " + << q.get_device().get_info() + << std::endl; + return -1; + } + + source_kb kbSrc = syclex::create_kernel_bundle_from_source( + ctx, syclex::source_language::sycl_jit, DeviceCodeSplitSource); + + // Test explicit device code split + std::vector names{"vec_add", "vec_add", + "vec_add"}; + auto build = [&](const std::string &mode) -> size_t { + exe_kb kbExe = syclex::build( + kbSrc, syclex::properties{ + syclex::registered_kernel_names{names}, + syclex::build_options{"-fsycl-device-code-split=" + mode}}); + return std::distance(kbExe.begin(), kbExe.end()); + }; + + size_t perKernelNImg = build("per_kernel"); + size_t perSourceNImg = build("per_source"); + size_t offNImg = build("off"); + size_t autoNImg = build("auto"); + + assert(perKernelNImg == 3); + assert(perSourceNImg == 1); + assert(offNImg == 1); + assert(autoNImg >= offNImg && autoNImg <= perKernelNImg); + + // Test implicit device code split + names = {"vec_add", "vec_add"}; + exe_kb kbDiffWorkGroupSizes = syclex::build( + kbSrc, syclex::properties{syclex::registered_kernel_names{names}}); + 
assert(std::distance(kbDiffWorkGroupSizes.begin(), + kbDiffWorkGroupSizes.end()) == 2); + + return 0; +} + int test_esimd() { namespace syclex = sycl::ext::oneapi::experimental; using source_kb = sycl::kernel_bundle; @@ -393,8 +452,8 @@ int test_warning() { int main(int argc, char **) { #ifdef SYCL_EXT_ONEAPI_KERNEL_COMPILER int optional_tests = (argc > 1) ? test_warning() : 0; - return test_build_and_run() || test_esimd() || test_unsupported_options() || - test_error() || optional_tests; + return test_build_and_run() || test_device_code_split() || test_esimd() || + test_unsupported_options() || test_error() || optional_tests; #else static_assert(false, "Kernel Compiler feature test macro undefined"); #endif From 27353ed4de9ad0f450de2213fc0fa5e275e6153d Mon Sep 17 00:00:00 2001 From: Harald van Dijk Date: Tue, 28 Jan 2025 07:44:47 +0000 Subject: [PATCH 25/45] [SYCL][NativeCPU] Update OCK. (#16785) Update to newer OCK to pull in fixes for subgroups. https://github.com/uxlfoundation/oneapi-construction-kit/compare/d983db7aa87fc1a6f7cdb46e3ced63f6f145749e...846a5c6118826171fb1c93702dd12b004053165c --- llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt b/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt index bbfb74f7a3529..bc059057d024f 100644 --- a/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt +++ b/llvm/lib/SYCLNativeCPUUtils/CMakeLists.txt @@ -34,16 +34,16 @@ endif() if(NATIVECPU_USE_OCK) if(NATIVECPU_OCK_USE_FETCHCONTENT) - set(OCK_GIT_INTERNAL_REPO "https://github.com/codeplaysoftware/oneapi-construction-kit.git") - # commit d983db7aa87fc1a6f7cdb46e3ced63f6f145749e - # Merge: 1d3a925c 2c510ca2 + set(OCK_GIT_INTERNAL_REPO "https://github.com/uxlfoundation/oneapi-construction-kit.git") + # commit 846a5c6118826171fb1c93702dd12b004053165c + # Merge: 81355afb 11967b11 # Author: Harald van Dijk - # Date: Tue Oct 15 15:50:57 2024 +0100 + # Date: Fri Jan 24 15:41:13 
2025 +0000 # - # Merge pull request #566 from hvdijk/fix-clang-format + # Merge pull request #659 from hvdijk/vec-size-1 # - # clang-format: fix output. - set(OCK_GIT_INTERNAL_TAG d983db7aa87fc1a6f7cdb46e3ced63f6f145749e) + # [vecz] Handle vectors of size 1. + set(OCK_GIT_INTERNAL_TAG 846a5c6118826171fb1c93702dd12b004053165c) # Overwrite OCK_GIT_INTERNAL_REPO/OCK_GIT_INTERNAL_TAG if the corresponding options are set if(OCK_GIT_REPO) From cf19f7758c6ed3b6bdbf0c2fc4cefbe08cbaabd3 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Tue, 28 Jan 2025 12:34:13 +0000 Subject: [PATCH 26/45] [UR] fix parseDisjointPoolConfig and add tests (#16791) https://github.com/oneapi-src/unified-runtime/pull/2574 --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 7b46bd5b034c9..4ca76b207fdf3 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 0bb6789f0113ea937d861fd67fd677b91ecdeb8b -# Merge: e370a2b9 eeff9f4a +# commit 095e8464124a48c8ed4b995403e754254c072143 +# Merge: 0bb6789f 07001aa7 # Author: Kenneth Benzie (Benie) -# Date: Mon Jan 27 10:40:02 2025 +0000 -# Merge pull request #2551 from przemektmalon/przemek/bindless-images-host-usm -# Enable creation of bindless images backed by host USM -set(UNIFIED_RUNTIME_TAG 0bb6789f0113ea937d861fd67fd677b91ecdeb8b) +# Date: Mon Jan 27 14:53:22 2025 +0000 +# Merge pull request #2574 from bratpiorka/rrudnick_fix_usm_pool_config_parse +# fix parseDisjointPoolConfig and add tests +set(UNIFIED_RUNTIME_TAG 095e8464124a48c8ed4b995403e754254c072143) From 4c198eef7ae3ef53651867e0556a944110fff430 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Tue, 28 Jan 2025 13:51:48 +0000 Subject: [PATCH 27/45] [SYCL][Graph] Re-enable SLM update tests (#16798) Re-enable whole graph update tests using local 
memory on Level-Zero. --- sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp | 3 --- .../Graph/Update/Explicit/whole_update_local_acc_multi.cpp | 3 --- .../Graph/Update/Explicit/whole_update_work_group_memory.cpp | 3 --- .../Graph/Update/RecordReplay/whole_update_local_acc.cpp | 3 --- .../Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp | 3 --- .../Update/RecordReplay/whole_update_work_group_memory.cpp | 3 --- 6 files changed, 18 deletions(-) diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp index a99eec42afa02..1db9905457ae7 100644 --- a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-422 - #define GRAPH_E2E_EXPLICIT #include "../../Inputs/whole_update_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp index d15ec880d89ad..e1a9ccf9a941d 100644 --- a/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_local_acc_multi.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-422 - #define GRAPH_E2E_EXPLICIT #include "../../Inputs/whole_update_local_acc_multi.cpp" diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_work_group_memory.cpp 
b/sycl/test-e2e/Graph/Update/Explicit/whole_update_work_group_memory.cpp index c2e4f396f2281..4a5612a23f46f 100644 --- a/sycl/test-e2e/Graph/Update/Explicit/whole_update_work_group_memory.cpp +++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_work_group_memory.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-422 - #define GRAPH_E2E_EXPLICIT #include "../../Inputs/whole_update_work_group_memory.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp index ab9bb3fe37fa5..03645b2f19bfd 100644 --- a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-422 - #define GRAPH_E2E_RECORD_REPLAY #include "../../Inputs/whole_update_local_acc.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp index 97a83de1129b3..f953915379641 100644 --- a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_local_acc_multi.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: 
OFNAAO-422 - #define GRAPH_E2E_RECORD_REPLAY #include "../../Inputs/whole_update_local_acc_multi.cpp" diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_work_group_memory.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_work_group_memory.cpp index 1a8183e526dc0..ebdc883475b05 100644 --- a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_work_group_memory.cpp +++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_work_group_memory.cpp @@ -5,9 +5,6 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// XFAIL: level_zero -// XFAIL-TRACKER: OFNAAO-422 - #define GRAPH_E2E_RECORD_REPLAY #include "../../Inputs/whole_update_work_group_memory.cpp" From fee444922c566ef85da8bb5f30dba2bbd46ecbd6 Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Tue, 28 Jan 2025 07:59:12 -0700 Subject: [PATCH 28/45] [SYCL][Matrix][E2E] Remove `REQUIRES: build-and-run-mode` from Matrix tests (#16787) As of #16725 tests that do not build for `spir64` do not need to be marked as exceptions for split build/run with `REQUIRES: build-and-run-mode`, instead we can mark them as `REQUIRES: target-`. This patch replaces the `REQUIRES: build-and-run-mode` directives in Matrix tests with `REQUIRES: target-amd` or `REQUIRES: target-nvidia`. 
--- sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp | 2 +- sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp | 3 +-- sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp b/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp index fbcf2c70558f4..5db815ca081fb 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_hip_gfx90a.cpp @@ -9,8 +9,8 @@ // RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o %t.out // RUN: %{run} %t.out +// REQUIRES: target-amd // REQUIRES: arch-amd_gpu_gfx90a -// REQUIRES: build-and-run-mode #include "joint_matrix_hip_apply.hpp" #include "joint_matrix_hip_copy.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp b/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp index 7c8d576ba39d0..8870e36a497fa 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_hip_half_gfx90a.cpp @@ -9,9 +9,9 @@ // RUN: %clangxx -fsycl -fsycl-targets=amd_gpu_gfx90a %s -o %t.out // RUN: %{run} %t.out +// REQUIRES: target-amd // REQUIRES: arch-amd_gpu_gfx90a // REQUIRES: aspect-fp16 -// REQUIRES: build-and-run-mode #include "joint_matrix_hip_apply.hpp" #include "joint_matrix_hip_copy.hpp" diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp index dc084b7c23c0e..bfa1156bc0e6b 100644 --- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp +++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp @@ -6,8 +6,7 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: cuda -// REQUIRES: build-and-run-mode +// REQUIRES: target-nvidia // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_70 -o %t.out // RUN: %{run} %t.out // diff --git 
a/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp b/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp index a7bdd0c056db1..a47a3ef725b73 100644 --- a/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp +++ b/sycl/test-e2e/Matrix/runtime_query_hip_gfx90a.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// +// REQUIRES: target-amd // REQUIRES: arch-amd_gpu_gfx90a -// REQUIRES: build-and-run-mode // RUN: %clangxx -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend=amdgcn-amd-amdhsa --offload-arch=gfx90a %s -o %t.out // RUN: %{run} %t.out From cf1df786215da57ad88335134ee9cc5e9fba04c7 Mon Sep 17 00:00:00 2001 From: Udit Kumar Agarwal Date: Tue, 28 Jan 2025 07:02:16 -0800 Subject: [PATCH 29/45] Revert "[CI] Temporarily disable tests requiring `spirv-tools` in CI" (#16801) Reverts intel/llvm#16743. After installing `pkg-config` in CI, `llvm-spirv` can correctly find `spirv-tools` so this workaround is no longer needed. Example run: https://github.com/intel/llvm/actions/runs/12996166493/job/36244369469?pr=16801#step:14:16 --- .github/workflows/sycl-linux-build.yml | 3 --- .github/workflows/sycl-windows-build.yml | 3 --- llvm-spirv/test/lit.cfg.py | 11 ++++------- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/.github/workflows/sycl-linux-build.yml b/.github/workflows/sycl-linux-build.yml index 412e081eb9cb1..4b5a76e73c6fc 100644 --- a/.github/workflows/sycl-linux-build.yml +++ b/.github/workflows/sycl-linux-build.yml @@ -195,9 +195,6 @@ jobs: cmake --build $GITHUB_WORKSPACE/build --target check-sycl-unittests - name: check-llvm-spirv if: always() && !cancelled() && contains(inputs.changes, 'llvm_spirv') - # Temporary workaround to disable running tests requiring spirv-tools.
- env: - LIT_OPTS: "--param disable-spirv-tools=True" run: | cmake --build $GITHUB_WORKSPACE/build --target check-llvm-spirv - name: check-xptifw diff --git a/.github/workflows/sycl-windows-build.yml b/.github/workflows/sycl-windows-build.yml index ef9c75b860539..bd6ea38d5f738 100644 --- a/.github/workflows/sycl-windows-build.yml +++ b/.github/workflows/sycl-windows-build.yml @@ -145,9 +145,6 @@ jobs: cmake --build build --target check-sycl-unittests - name: check-llvm-spirv if: always() && !cancelled() && contains(inputs.changes, 'llvm_spirv') - # Temporary workaround to disable running tests requiring spirv-tools. - env: - LIT_OPTS: "--param disable-spirv-tools=True" run: | cmake --build build --target check-llvm-spirv - name: check-xptifw diff --git a/llvm-spirv/test/lit.cfg.py b/llvm-spirv/test/lit.cfg.py index 29b8a6e4dd3f6..b56953aa56d01 100644 --- a/llvm-spirv/test/lit.cfg.py +++ b/llvm-spirv/test/lit.cfg.py @@ -60,27 +60,24 @@ using_spirv_tools = False -# Explicitly disable using spirv tools, if requested. 
-disable_spirv_tools = lit_config.params.get("disable-spirv-tools", False) - -if config.spirv_tools_have_spirv_as and not disable_spirv_tools: +if config.spirv_tools_have_spirv_as: llvm_config.add_tool_substitutions(['spirv-as'], [config.spirv_tools_bin_dir]) config.available_features.add('spirv-as') using_spirv_tools = True -if config.spirv_tools_have_spirv_dis and not disable_spirv_tools: +if config.spirv_tools_have_spirv_dis: llvm_config.add_tool_substitutions(['spirv-dis'], [config.spirv_tools_bin_dir]) config.available_features.add('spirv-dis') using_spirv_tools = True -if config.spirv_tools_have_spirv_link and not disable_spirv_tools: +if config.spirv_tools_have_spirv_link: llvm_config.add_tool_substitutions(['spirv-link'], [config.spirv_tools_bin_dir]) config.available_features.add('spirv-link') using_spirv_tools = True # Unlike spirv-{as,dis,link} above, running spirv-val is optional: if spirv-val is # not available, the test must still run and just skip any spirv-val commands. -if config.spirv_tools_have_spirv_val and not disable_spirv_tools: +if config.spirv_tools_have_spirv_val: llvm_config.add_tool_substitutions(['spirv-val'], [config.spirv_tools_bin_dir]) using_spirv_tools = True else: From a8ee28a6a2ca14716da66eb55e7ca04634a5fcaf Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Tue, 28 Jan 2025 16:06:13 +0100 Subject: [PATCH 30/45] [E2E] Update the current XFAIL status of e2e tests with the SPIR-V Backend (#16692) This PR updates the current XFAIL status of e2e tests with the SPIR-V Backend to keep CI runs with the SPIR-V Backend green. 
Freshly validated by the following workflow runs: - CPU: https://github.com/intel/llvm/actions/runs/12870111017 - GPU: https://github.com/intel/llvm/actions/runs/12871331751 --- sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp | 3 +++ sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp | 3 +++ sycl/test-e2e/MemorySanitizer/check_buffer_memset_memcpy.cpp | 3 +++ sycl/test-e2e/MemorySanitizer/check_device_global.cpp | 3 +++ sycl/test-e2e/MemorySanitizer/check_usm.cpp | 3 +++ sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp | 4 ---- 6 files changed, 15 insertions(+), 4 deletions(-) diff --git a/sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp b/sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp index fece65c39706a..755ecfb30d731 100644 --- a/sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp +++ b/sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp @@ -3,6 +3,9 @@ // RUN: %{build} -D__SYCL_INTERNAL_API -o %t.out %opencl_lib // RUN: %{run} %t.out +// XFAIL: spirv-backend && cpu +// XFAIL-TRACKER: CMPLRLLVM-64705 + #include #include #include diff --git a/sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp b/sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp index 7d6460b7ba5ff..ebbbbadedd089 100644 --- a/sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp +++ b/sycl/test-e2e/GroupAlgorithm/load_store/conversions_store.cpp @@ -1,6 +1,9 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +// XFAIL: spirv-backend && cpu +// XFAIL-TRACKER: CMPLRLLVM-64705 + #include #include diff --git a/sycl/test-e2e/MemorySanitizer/check_buffer_memset_memcpy.cpp b/sycl/test-e2e/MemorySanitizer/check_buffer_memset_memcpy.cpp index a1f676a1933ef..3447a51ea353c 100644 --- a/sycl/test-e2e/MemorySanitizer/check_buffer_memset_memcpy.cpp +++ b/sycl/test-e2e/MemorySanitizer/check_buffer_memset_memcpy.cpp @@ -4,6 +4,9 @@ // RUN: %{build} %device_msan_flags -O2 -g -o %t2.out // RUN: %{run} %t2.out 2>&1 | FileCheck %s +// XFAIL: spirv-backend +// 
XFAIL-TRACKER: CMPLRLLVM-64705 + #include __attribute__((noinline)) int foo(int data1, int data2) { diff --git a/sycl/test-e2e/MemorySanitizer/check_device_global.cpp b/sycl/test-e2e/MemorySanitizer/check_device_global.cpp index f8b47569deb9b..40029a61d825f 100644 --- a/sycl/test-e2e/MemorySanitizer/check_device_global.cpp +++ b/sycl/test-e2e/MemorySanitizer/check_device_global.cpp @@ -6,6 +6,9 @@ // RUN: %{build} %device_msan_flags -O2 -g -o %t3.out // RUN: %{run} not %t3.out 2>&1 | FileCheck %s +// XFAIL: spirv-backend +// XFAIL-TRACKER: CMPLRLLVM-64705 + #include #include #include diff --git a/sycl/test-e2e/MemorySanitizer/check_usm.cpp b/sycl/test-e2e/MemorySanitizer/check_usm.cpp index b28895b1391a8..be84e59b1ab0c 100644 --- a/sycl/test-e2e/MemorySanitizer/check_usm.cpp +++ b/sycl/test-e2e/MemorySanitizer/check_usm.cpp @@ -7,6 +7,9 @@ // UNSUPPORTED: cpu // UNSUPPORTED-TRACKER: CMPLRLLVM-64618 +// XFAIL: spirv-backend && gpu +// XFAIL-TRACKER: CMPLRLLVM-64705 + #include #include diff --git a/sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp b/sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp index ad65c42fccf70..4ea1fe420e31c 100644 --- a/sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp +++ b/sycl/test-e2e/Reduction/reduction_nd_N_vars.cpp @@ -4,10 +4,6 @@ // Windows doesn't yet have full shutdown(). // UNSUPPORTED: ze_debug && windows -// Depends on SPIR-V Backend & run-time drivers version. -// XFAIL: spirv-backend && gpu -// XFAIL-TRACKER: CMPLRLLVM-64705 - // This test checks handling of parallel_for() accepting nd_range and // two or more reductions. 
From ae85db7a59d41ba6213cf33dc70296ee8268a822 Mon Sep 17 00:00:00 2001 From: Daniel Skrobot Date: Tue, 28 Jan 2025 16:07:39 +0100 Subject: [PATCH 31/45] [UR] Addition of num_compute_units query (#16538) Co-authored-by: Greg Lueck Co-authored-by: Kenneth Benzie (Benie) --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 14 ++++---- ...sycl_ext_oneapi_num_compute_units.asciidoc | 7 +--- .../sycl/info/ext_oneapi_device_traits.def | 4 +++ sycl/source/feature_test.hpp.in | 1 + .../NumComputeUnits/num_compute_units.cpp | 27 ++++++++++++++ sycl/test/abi/sycl_symbols_linux.dump | 2 ++ sycl/test/abi/sycl_symbols_windows.dump | 2 ++ sycl/unittests/Extensions/CMakeLists.txt | 1 + .../Extensions/NumComputeUnits/CMakeLists.txt | 4 +++ .../NumComputeUnits/ReturnedQueryValue.cpp | 36 +++++++++++++++++++ 10 files changed, 85 insertions(+), 13 deletions(-) rename sycl/doc/extensions/{proposed => supported}/sycl_ext_oneapi_num_compute_units.asciidoc (92%) create mode 100644 sycl/test-e2e/NumComputeUnits/num_compute_units.cpp create mode 100644 sycl/unittests/Extensions/NumComputeUnits/CMakeLists.txt create mode 100644 sycl/unittests/Extensions/NumComputeUnits/ReturnedQueryValue.cpp diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 4ca76b207fdf3..85ea961d9468e 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 095e8464124a48c8ed4b995403e754254c072143 -# Merge: 0bb6789f 07001aa7 -# Author: Kenneth Benzie (Benie) -# Date: Mon Jan 27 14:53:22 2025 +0000 -# Merge pull request #2574 from bratpiorka/rrudnick_fix_usm_pool_config_parse -# fix parseDisjointPoolConfig and add tests -set(UNIFIED_RUNTIME_TAG 095e8464124a48c8ed4b995403e754254c072143) +# commit 78e1b33271d28d26845a4bfae7ae3b72c14e0e63 +# Merge: 902bb2e2 94b32ac2 +# Author: Ross Brunton +# Date: Tue Jan 28 11:02:44 2025 +0000 +# Merge pull request #2624 from RossBrunton/ross/msanfix +# Assert that 
Device is valid for memory poisoning +set(UNIFIED_RUNTIME_TAG 78e1b33271d28d26845a4bfae7ae3b72c14e0e63) diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_num_compute_units.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_num_compute_units.asciidoc similarity index 92% rename from sycl/doc/extensions/proposed/sycl_ext_oneapi_num_compute_units.asciidoc rename to sycl/doc/extensions/supported/sycl_ext_oneapi_num_compute_units.asciidoc index 5e432acf00588..b1e33f6543ad6 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_num_compute_units.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_num_compute_units.asciidoc @@ -44,12 +44,7 @@ SYCL specification refer to that revision. == Status -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* - +This extension is implemented and fully supported by {dpcpp}. 
== Overview diff --git a/sycl/include/sycl/info/ext_oneapi_device_traits.def b/sycl/include/sycl/info/ext_oneapi_device_traits.def index 813ec952b20d1..d9269b7557b07 100644 --- a/sycl/include/sycl/info/ext_oneapi_device_traits.def +++ b/sycl/include/sycl/info/ext_oneapi_device_traits.def @@ -81,6 +81,10 @@ __SYCL_PARAM_TRAITS_SPEC(ext::oneapi::experimental, device, composite_device, sycl::device, UR_DEVICE_INFO_COMPOSITE_DEVICE) +__SYCL_PARAM_TRAITS_SPEC(ext::oneapi, device, + num_compute_units, size_t, + UR_DEVICE_INFO_NUM_COMPUTE_UNITS) + #ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC #undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in index 78199433a9e8b..f062302257e4b 100644 --- a/sycl/source/feature_test.hpp.in +++ b/sycl/source/feature_test.hpp.in @@ -112,6 +112,7 @@ inline namespace _V1 { #define SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY 1 #define SYCL_EXT_ONEAPI_WORK_GROUP_SCRATCH_MEMORY 1 #define SYCL_EXT_ONEAPI_WORK_GROUP_STATIC 1 +#define SYCL_EXT_ONEAPI_NUM_COMPUTE_UNITS 1 // In progress yet #define SYCL_EXT_ONEAPI_ATOMIC16 0 diff --git a/sycl/test-e2e/NumComputeUnits/num_compute_units.cpp b/sycl/test-e2e/NumComputeUnits/num_compute_units.cpp new file mode 100644 index 0000000000000..03a8558734747 --- /dev/null +++ b/sycl/test-e2e/NumComputeUnits/num_compute_units.cpp @@ -0,0 +1,27 @@ +// This test checks whether the number of compute units in the device descriptor +// returns a valid value. 
+ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +#include +#include + +int main() { + +#ifdef SYCL_EXT_ONEAPI_NUM_COMPUTE_UNITS + sycl::queue Queue; + sycl::device Device = Queue.get_device(); + + size_t NumberComputeUnits = + Device.get_info(); + + assert(NumberComputeUnits >= 1 && + "The minimum value for number of compute units in the device is 1"); + +#else + static_assert(false, "SYCL_EXT_ONEAPI_NUM_COMPUTE_UNITS not defined"); +#endif + + return 0; +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 224dd494a6258..7dd3c11d945b8 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4029,3 +4029,5 @@ _ZNK4sycl3_V19kernel_id8get_nameEv _ZNKSt4hashIN4sycl3_V15queueEEclERKS2_ __sycl_register_lib __sycl_unregister_lib +_ZNK4sycl3_V16detail11device_impl8get_infoINS0_3ext6oneapi4info6device17num_compute_unitsEEENT_11return_typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi4info6device17num_compute_unitsEEENS0_6detail11ABINeutralTINS8_19is_device_info_descIT_E11return_typeEE4typeEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index da02292f0028b..a0649afb91058 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4344,3 +4344,5 @@ DllMain __sycl_register_lib __sycl_unregister_lib +??$get_info@Unum_compute_units@device@info@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA_KXZ +??$get_info_impl@Unum_compute_units@device@info@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA_KXZ diff --git a/sycl/unittests/Extensions/CMakeLists.txt b/sycl/unittests/Extensions/CMakeLists.txt index 5fa5d04d39317..8498f7bccfdd3 100644 --- a/sycl/unittests/Extensions/CMakeLists.txt +++ b/sycl/unittests/Extensions/CMakeLists.txt @@ -24,3 +24,4 @@ add_sycl_unittest(ExtensionsTests OBJECT add_subdirectory(CommandGraph) add_subdirectory(VirtualFunctions) add_subdirectory(VirtualMemory) 
+add_subdirectory(NumComputeUnits) diff --git a/sycl/unittests/Extensions/NumComputeUnits/CMakeLists.txt b/sycl/unittests/Extensions/NumComputeUnits/CMakeLists.txt new file mode 100644 index 0000000000000..fe60339444d19 --- /dev/null +++ b/sycl/unittests/Extensions/NumComputeUnits/CMakeLists.txt @@ -0,0 +1,4 @@ +add_sycl_unittest(NumComputeUnitsTests OBJECT + ReturnedQueryValue.cpp +) + diff --git a/sycl/unittests/Extensions/NumComputeUnits/ReturnedQueryValue.cpp b/sycl/unittests/Extensions/NumComputeUnits/ReturnedQueryValue.cpp new file mode 100644 index 0000000000000..71190521e4536 --- /dev/null +++ b/sycl/unittests/Extensions/NumComputeUnits/ReturnedQueryValue.cpp @@ -0,0 +1,36 @@ +#include + +#include "ur_api.h" + +#include +#include + +template +ur_result_t after_urDeviceGetInfo(void *pParams) { + auto params = reinterpret_cast(pParams); + if (*params->ppropName == UR_DEVICE_INFO_NUM_COMPUTE_UNITS) { + if (*params->ppPropValue) + *static_cast(*params->ppPropValue) = ExpectedValue; + if (*params->ppPropSizeRet) + **params->ppPropSizeRet = sizeof(uint32_t); + } + return UR_RESULT_SUCCESS; +} + +TEST(NumComputeUnitsTests, CheckExpectedValue) { + + constexpr uint32_t ExpectedNumComputeUnits = 111; + + sycl::unittest::UrMock<> Mock; + sycl::platform Platform = sycl::platform(); + sycl::queue Queue{Platform.get_devices()[0]}; + + mock::getCallbacks().set_after_callback( + "urDeviceGetInfo", &after_urDeviceGetInfo); + + size_t NumberComputeUnits = + Queue.get_device() + .get_info(); + + EXPECT_EQ(NumberComputeUnits, ExpectedNumComputeUnits); +} From a4aa7f07cba16d482eb49187602f5295cfdaced0 Mon Sep 17 00:00:00 2001 From: Nikita Kornev Date: Tue, 28 Jan 2025 16:20:35 +0100 Subject: [PATCH 32/45] [CI] Ping assignees after 90 days (#16815) The workflow pings assignees of issues if there are no updates for the last 60 days. Increase to 90 to reduce the number of these messages. 
--- .github/workflows/sycl-issues-ping-assignee.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/sycl-issues-ping-assignee.yml b/.github/workflows/sycl-issues-ping-assignee.yml index c809d67586e71..adb4c2e5b658d 100644 --- a/.github/workflows/sycl-issues-ping-assignee.yml +++ b/.github/workflows/sycl-issues-ping-assignee.yml @@ -20,7 +20,7 @@ jobs: run: permissions: issues: write - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO: ${{ github.repository }} @@ -39,7 +39,7 @@ jobs: - name: Filter issues and ping run: | - days_to_stale=60 + days_to_stale=90 current_time=$(date +%s) cat issues.json | jq -c '.[]' | while read -r issue; do From b9b1f884a61cff1d576eb65e6ea80b5f3fe88880 Mon Sep 17 00:00:00 2001 From: Wenju He Date: Tue, 28 Jan 2025 16:33:16 +0000 Subject: [PATCH 33/45] [NFC][libclc] Fix clcfunc.h merge conflict resolve in b400aa4586c4 (#16809) These 4 macros were moved in 4602c16a6823, but added back by b400aa4586c4, resulting in duplicate definitions in this file.
--- libclc/clc/include/clc/clcfunc.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/libclc/clc/include/clc/clcfunc.h b/libclc/clc/include/clc/clcfunc.h index e07bcd0af0ebe..6c95a07b1c184 100644 --- a/libclc/clc/include/clc/clcfunc.h +++ b/libclc/clc/include/clc/clcfunc.h @@ -18,9 +18,4 @@ #define _CLC_DEF __attribute__((always_inline)) #endif -#define _CLC_INLINE __attribute__((always_inline)) inline -#define _CLC_CONVERGENT __attribute__((convergent)) -#define _CLC_PURE __attribute__((pure)) -#define _CLC_CONSTFN __attribute__((const)) - #endif // __CLC_CLCFUNC_H_ From 981e74530393143c611eb641d7e8ee3203f089fa Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Tue, 28 Jan 2025 09:04:47 -0800 Subject: [PATCH 34/45] [SYCL][E2E][Joint Matrix] New test transpose A and B (#16684) --- sycl/test-e2e/Matrix/Inputs/common.hpp | 19 ++- .../Matrix/joint_matrix_transposeAB.cpp | 133 ++++++++++++++++++ 2 files changed, 149 insertions(+), 3 deletions(-) create mode 100644 sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp diff --git a/sycl/test-e2e/Matrix/Inputs/common.hpp b/sycl/test-e2e/Matrix/Inputs/common.hpp index 58937722642df..dca215ae574d2 100644 --- a/sycl/test-e2e/Matrix/Inputs/common.hpp +++ b/sycl/test-e2e/Matrix/Inputs/common.hpp @@ -183,10 +183,10 @@ bool matrix_compare(unsigned int rows, unsigned int cols, T1 *src, T2 *ref) { } } else if constexpr (exact || std::is_integral_v) { if (src[i * cols + j] != ref[i * cols + j]) { - std::cerr << "Incorrect result in matrix." + std::cerr << "Incorrect result in matrix. 
" << "i: " << i << ", j: " << j - << ", Ref: " << ref[i * cols + j] - << ", Val: " << src[i * cols + j] << "\n"; + << ", Ref: " << (int)ref[i * cols + j] + << ", Val: " << (int)src[i * cols + j] << "\n"; return false; } } else { @@ -221,3 +221,16 @@ template size_t get_sg_size(queue q) { .template get_info( q.get_device()); } + +template +void matrix_print(unsigned int rows, unsigned int cols, T *mat) { + for (unsigned int i = 0; i < rows; i++) { + for (unsigned int j = 0; j < cols; j++) { + if constexpr (std::is_integral_v) + std::cout << (int)mat[i * cols + j] << " "; + else + std::cout << (float)mat[i * cols + j] << " "; + } + std::cout << "\n"; + } +} diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp new file mode 100644 index 0000000000000..7a838d97a8336 --- /dev/null +++ b/sycl/test-e2e/Matrix/joint_matrix_transposeAB.cpp @@ -0,0 +1,133 @@ +//===---joint_matrix_transposeAB.cpp - DPC++ joint_matrix--------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: aspect-ext_intel_matrix + +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out + +// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2 +// RUN: %if !arch-intel_gpu_dg2 %{ %{build} -o %t_sg32.out -DSG_SZ=32 %} +// RUN: %if !arch-intel_gpu_dg2 %{ %{run} %t_sg32.out %} + +// XFAIL: gpu +// XFAIL-TRACKER: GSD-5768 + +// XFAIL: cpu +// XFAIL-TRACKER: CMPLRLLVM-52693 + +#include "common.hpp" +#include + +template class MT; + +template +void matrix_transpose(T *in, T *out, queue q) { + static_assert((NR % TR) == 0); + static_assert((NC % TC) == 0); + size_t sg_size = get_sg_size>(q); + std::cout << "SG size " << sg_size << " "; + + q.submit([&](handler &cgh) { + cgh.parallel_for>( + nd_range<2>({NR / TR, NC / TC * sg_size}, {1, 1 * sg_size}), + [=](nd_item<2> spmd_item) +#ifdef SG_SZ + [[sycl::reqd_sub_group_size(SG_SZ)]] +#endif + { + auto in_ptr = + address_space_cast(in); + auto out_ptr = + address_space_cast(out); + + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + matrix_row_major; + joint_matrix + matrix_col_major; + + auto row_major_offset = + (sg_startx * TR) * NC + sg_starty / sg_size * TC; + auto col_major_offset = + (sg_startx * TR) + (sg_starty / sg_size * TC) * NR; + + joint_matrix_load(sg, matrix_row_major, in_ptr + row_major_offset, + NC); + joint_matrix_copy(sg, matrix_row_major, matrix_col_major); + ext::intel::experimental::matrix::joint_matrix_store( + sg, matrix_col_major, out_ptr + col_major_offset, NR); + }); // parallel for + }).wait(); +} + +template void test() { + std::cout << "Test " << TR << " x " << TC << " "; + 
static constexpr size_t SCALE = 2; + static constexpr size_t MATRIX_R = TR * SCALE; + static constexpr size_t MATRIX_C = TC * SCALE; + + queue q; + T *in = malloc_shared(MATRIX_R * MATRIX_C, q); + T *col_major = malloc_shared(MATRIX_C * MATRIX_R, q); + T *ref_col_major = malloc_shared(MATRIX_C * MATRIX_R, q); + + matrix_rand(MATRIX_R, MATRIX_C, in, (T)5); + matrix_transpose(in, col_major, q); + matrix_transpose(MATRIX_R, MATRIX_C, ref_col_major, in); + assert((matrix_compare(MATRIX_C, MATRIX_R, col_major, + ref_col_major))); + std::cout << "PASSED\n"; + + free(in, q); + free(col_major, q); + free(ref_col_major, q); +} + +int main() { + queue q; + std::vector combinations = + q.get_device().get_info(); + bool bf16_run = false; + bool half_run = false; + bool int8_run = false; + + for (auto &combination : combinations) { + if (!bf16_run && combination.atype == matrix_type::bf16) { + std::cout << "bf16:\n"; + test(); + test(); + bf16_run = true; + } + + if (!half_run && combination.atype == matrix_type::fp16) { + std::cout << "half:\n"; + test(); + test(); + half_run = true; + } + + if (!int8_run && combination.atype == matrix_type::sint8) { + std::cout << "int8:\n"; + test(); + test(); + int8_run = true; + } + + if (bf16_run && half_run && int8_run) + break; + } + + return 0; +} From 67a3806f0aaeca77a1bf5bb6bdd7778d88b4bcff Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 28 Jan 2025 12:07:12 -0800 Subject: [PATCH 35/45] [CI] Simplify CI by using `alldeps` image whenever possible (#16806) First step on unifying our "base" image into a single "fat" container similarly to what's been done with nightly images at https://github.com/intel/llvm/pull/16680. 
--- .github/workflows/sycl-linux-precommit.yml | 11 ----------- .github/workflows/sycl-linux-run-tests.yml | 12 ++++++++---- .github/workflows/sycl-nightly.yml | 12 ------------ .github/workflows/sycl-post-commit.yml | 2 -- .github/workflows/sycl-rel-nightly.yml | 7 ------- .github/workflows/sycl-weekly.yml | 3 --- sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp | 13 ++++++++++--- .../KernelCompiler/kernel_compiler_spirv.cpp | 2 +- sycl/test-e2e/Regression/device_num.cpp | 3 +++ 9 files changed, 22 insertions(+), 43 deletions(-) diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml index 025944b1d2f12..36e85d98aca21 100644 --- a/.github/workflows/sycl-linux-precommit.yml +++ b/.github/workflows/sycl-linux-precommit.yml @@ -82,7 +82,6 @@ jobs: include: - name: Intel runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu;opencl:cpu reset_intel_gpu: true @@ -91,7 +90,6 @@ jobs: with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} - image: ${{ matrix.image }} image_options: ${{ matrix.image_options }} target_devices: ${{ matrix.target_devices }} extra_lit_opts: --param fallback-to-build-if-requires-build-and-run=True ${{ matrix.extra_lit_opts }} @@ -112,18 +110,15 @@ jobs: include: - name: NVIDIA/CUDA runner: '["Linux", "cuda"]' - image: ghcr.io/intel/llvm/ubuntu2404_build:latest image_options: -u 1001 --gpus all --cap-add SYS_ADMIN target_devices: cuda:gpu - name: AMD/HIP runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2404_build:latest image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: hip:gpu reset_intel_gpu: false - name: E2E tests on Intel Arc A-Series Graphics runner: '["Linux", "arc"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 
--device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu reset_intel_gpu: true @@ -140,7 +135,6 @@ jobs: use_igc_dev: true - name: E2E tests on Intel Ponte Vecchio GPU runner: '["Linux", "pvc"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu;opencl:gpu extra_lit_opts: -j 50 @@ -197,27 +191,22 @@ jobs: include: - name: Intel GEN12 Graphics system runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_extra_opts: --device=/dev/dri reset_intel_gpu: true - name: Intel Arc A-Series Graphics system runner: '["Linux", "arc"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_extra_opts: --device=/dev/dri reset_intel_gpu: true - name: AMD system runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2404_build:latest image_extra_opts: --device=/dev/dri --device=/dev/kfd - name: CUDA system runner: '["Linux", "cuda"]' - image: ghcr.io/intel/llvm/ubuntu2404_build:latest image_extra_opts: --gpus all uses: ./.github/workflows/sycl-linux-run-tests.yml with: name: Perf tests on ${{ matrix.name }} runner: ${{ matrix. 
runner }} - image: ${{ matrix.image }} image_options: -u 1001 --privileged --cap-add SYS_ADMIN ${{ matrix.image_extra_opts }} target_devices: all reset_intel_gpu: ${{ matrix.reset_intel_gpu }} diff --git a/.github/workflows/sycl-linux-run-tests.yml b/.github/workflows/sycl-linux-run-tests.yml index 2ede8a2eb2299..d01c267692ea9 100644 --- a/.github/workflows/sycl-linux-run-tests.yml +++ b/.github/workflows/sycl-linux-run-tests.yml @@ -12,7 +12,7 @@ on: required: True image: type: string - required: True + required: False image_options: type: string required: True @@ -114,25 +114,29 @@ on: - '["Linux", "pvc"]' - '["cts-cpu"]' - '["Linux", "build"]' + - '["cuda"]' image: type: choice options: - 'ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest' + - 'ghcr.io/intel/llvm/ubuntu2404_intel_drivers:alldeps' image_options: description: | Use option with "--device=/dev/kfd" for AMDGPU, without it for the rest. type: choice options: - - '-u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN' - '-u 1001 --device=/dev/dri --device=/dev/kfd --privileged --cap-add SYS_ADMIN' + - '-u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN' + - '-u 1001 --gpus all --cap-add SYS_ADMIN' target_devices: type: choice options: + - 'level_zero:gpu' - 'opencl:cpu' - 'opencl:gpu' - 'opencl:fpga' - - 'level_zero:gpu' - 'hip:gpu' + - 'cuda:gpu' tests_selector: type: choice options: @@ -182,7 +186,7 @@ jobs: name: ${{ inputs.name }} runs-on: ${{ fromJSON(inputs.runner) }} container: - image: ${{ inputs.image }} + image: ${{ inputs.image || 'ghcr.io/intel/llvm/ubuntu2404_intel_drivers:alldeps'}} options: ${{ inputs.image_options }} env: ${{ fromJSON(inputs.env) }} steps: diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml index 52510b54c76a5..899b9953ce0f1 100644 --- a/.github/workflows/sycl-nightly.yml +++ b/.github/workflows/sycl-nightly.yml @@ -58,14 +58,12 @@ jobs: include: - name: AMD/HIP runner: '["Linux", "amdgpu"]' - image: 
ghcr.io/intel/llvm/ubuntu2404_build:latest image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: hip:gpu tests_selector: e2e - name: Intel L0 GPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu reset_intel_gpu: true @@ -73,7 +71,6 @@ jobs: - name: Intel OCL GPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: opencl:gpu reset_intel_gpu: true @@ -81,21 +78,18 @@ jobs: - name: OCL CPU (AMD) runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 target_devices: opencl:cpu tests_selector: e2e - name: OCL CPU (Intel/GEN12) runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu tests_selector: e2e - name: OCL CPU (Intel/Arc) runner: '["Linux", "arc"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 target_devices: opencl:cpu tests_selector: e2e @@ -103,7 +97,6 @@ jobs: with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} - image: ${{ matrix.image }} image_options: ${{ matrix.image_options }} target_devices: ${{ matrix.target_devices }} tests_selector: ${{ matrix.tests_selector }} @@ -123,7 +116,6 @@ jobs: runner: '["Linux", "pvc"]' target_devices: level_zero:gpu extra_lit_opts: -j 50 - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN ref: ${{ github.sha }} sycl_toolchain_artifact: sycl_linux_oneapi @@ -192,7 +184,6 @@ jobs: name: Build SYCL-CTS runner: '["Linux", "build"]' 
cts_testing_mode: 'build-only' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN tests_selector: cts ref: ${{ github.sha }} @@ -209,13 +200,11 @@ jobs: include: - name: SYCL-CTS on OCL CPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu - name: SYCL-CTS on L0 gen12 runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu uses: ./.github/workflows/sycl-linux-run-tests.yml @@ -223,7 +212,6 @@ jobs: name: ${{ matrix.name }} runner: ${{ matrix.runner }} cts_testing_mode: 'run-only' - image: ${{ matrix.image }} image_options: ${{ matrix.image_options }} target_devices: ${{ matrix.target_devices }} tests_selector: cts diff --git a/.github/workflows/sycl-post-commit.yml b/.github/workflows/sycl-post-commit.yml index f3e4c224f897a..200a6a7e2129d 100644 --- a/.github/workflows/sycl-post-commit.yml +++ b/.github/workflows/sycl-post-commit.yml @@ -54,7 +54,6 @@ jobs: reset_intel_gpu: true - name: AMD/HIP runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2404_build:latest image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: hip:gpu reset_intel_gpu: false @@ -80,7 +79,6 @@ jobs: with: name: ${{ matrix.name }} runner: ${{ matrix. 
runner }} - image: ${{ matrix.image || 'ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest' }} image_options: ${{ matrix.image_options || '-u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN' }} target_devices: ${{ matrix.target_devices || 'level_zero:gpu' }} reset_intel_gpu: ${{ matrix.reset_intel_gpu }} diff --git a/.github/workflows/sycl-rel-nightly.yml b/.github/workflows/sycl-rel-nightly.yml index 803eca00cf857..cbe99c647bb2d 100644 --- a/.github/workflows/sycl-rel-nightly.yml +++ b/.github/workflows/sycl-rel-nightly.yml @@ -54,14 +54,12 @@ jobs: include: - name: AMD/HIP runner: '["Linux", "amdgpu"]' - image: ghcr.io/intel/llvm/ubuntu2404_build:latest image_options: -u 1001 --device=/dev/dri --device=/dev/kfd target_devices: hip:gpu tests_selector: e2e - name: Intel L0 GPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu reset_intel_gpu: true @@ -70,7 +68,6 @@ jobs: - name: Intel OCL GPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri -v /dev/dri/by-path:/dev/dri/by-path --privileged --cap-add SYS_ADMIN target_devices: opencl:gpu reset_intel_gpu: true @@ -79,21 +76,18 @@ jobs: - name: Intel OCL CPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu tests_selector: e2e - name: SYCL-CTS on OCL CPU runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu tests_selector: cts - name: SYCL-CTS on L0 gen12 runner: '["Linux", "gen12"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 
--device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu tests_selector: cts @@ -101,7 +95,6 @@ jobs: with: name: ${{ matrix.name }} runner: ${{ matrix.runner }} - image: ${{ matrix.image }} image_options: ${{ matrix.image_options }} target_devices: ${{ matrix.target_devices }} tests_selector: ${{ matrix.tests_selector }} diff --git a/.github/workflows/sycl-weekly.yml b/.github/workflows/sycl-weekly.yml index 0974470b972a2..3cdb146b6f436 100644 --- a/.github/workflows/sycl-weekly.yml +++ b/.github/workflows/sycl-weekly.yml @@ -46,13 +46,11 @@ jobs: include: - name: SYCL-CTS on OCL CPU PVC w/ LLVM SPIR-V Backend runner: '["Linux", "pvc"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: opencl:cpu - name: SYCL-CTS on L0 GPU PVC w/ LLVM SPIR-V Backend runner: '["Linux", "pvc"]' - image: ghcr.io/intel/llvm/ubuntu2404_intel_drivers:latest image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN target_devices: level_zero:gpu uses: ./.github/workflows/sycl-linux-run-tests.yml @@ -60,7 +58,6 @@ jobs: name: ${{ matrix.name }} runner: ${{ matrix.runner }} cts_testing_mode: 'run-only' - image: ${{ matrix.image }} image_options: ${{ matrix.image_options }} target_devices: ${{ matrix.target_devices }} tests_selector: cts diff --git a/sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp b/sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp index 755ecfb30d731..ab476c7e85320 100644 --- a/sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp +++ b/sycl/test-e2e/DeprecatedFeatures/opencl_interop.cpp @@ -1,7 +1,7 @@ -// REQUIRES: opencl, opencl_icd +// REQUIRES: any-device-is-opencl, opencl_icd, target-spir // RUN: %{build} -D__SYCL_INTERNAL_API -o %t.out %opencl_lib -// RUN: %{run} %t.out +// RUN: %{run-unfiltered-devices} %t.out // XFAIL: spirv-backend && cpu // XFAIL-TRACKER: CMPLRLLVM-64705 @@ -28,7 +28,14 @@ cl_platform_id 
selectOpenCLPlatform() { err = clGetPlatformIDs(num_of_platforms, &platforms[0], 0); CL_CHECK_ERRORS(err); - return platforms[0]; + for (int i = 0; i < num_of_platforms; ++i) { + cl_uint num_of_devices = 0; + err = + clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, 0, &num_of_devices); + if (err == CL_SUCCESS && num_of_devices > 0) + return platforms[i]; + } + throw std::runtime_error("No OpenCL platforms with available devices!"); } cl_device_id selectOpenCLDevice(cl_platform_id platform) { diff --git a/sycl/test-e2e/KernelCompiler/kernel_compiler_spirv.cpp b/sycl/test-e2e/KernelCompiler/kernel_compiler_spirv.cpp index 76f1a90bfa047..bf6a5201708b1 100644 --- a/sycl/test-e2e/KernelCompiler/kernel_compiler_spirv.cpp +++ b/sycl/test-e2e/KernelCompiler/kernel_compiler_spirv.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -// REQUIRES: ocloc +// REQUIRES: ocloc, target-spir // RUN: %{build} -o %t.out // RUN: %{run} %t.out %S/Kernels/kernels.spv %S/Kernels/kernels_fp16.spv %S/Kernels/kernels_fp64.spv diff --git a/sycl/test-e2e/Regression/device_num.cpp b/sycl/test-e2e/Regression/device_num.cpp index 50570d51c3d70..db8706d925d76 100644 --- a/sycl/test-e2e/Regression/device_num.cpp +++ b/sycl/test-e2e/Regression/device_num.cpp @@ -1,3 +1,6 @@ +// UNSUPPORTED: any-device-is-hip +// UNSUPPORTED-TRACKER: https://github.com/intel/llvm/issues/16805 + // RUN: %{build} -o %t.out // RUN: env PRINT_FULL_DEVICE_INFO=1 %{run-unfiltered-devices} %t.out > %t1.conf // RUN: env ONEAPI_DEVICE_SELECTOR="*:0" env TEST_DEV_CONFIG_FILE_NAME=%t1.conf %{run-unfiltered-devices} %t.out From 32abcd149000cf0e217642213fbfc7c6a15a4a57 Mon Sep 17 00:00:00 2001 From: Sergey Semenov Date: Tue, 28 Jan 2025 22:49:12 +0100 Subject: [PATCH 36/45] [SYCL][NFC] Fix flaky assert_in_multiple_tus behavior (#16799) The test expects that the first kernel hits an assertion and it never gets to the assertion in the second kernel. 
However, it was using an out-of-order queue, which meant that that behavior wasn't guaranteed. --- sycl/test-e2e/Assert/assert_in_multiple_tus.hpp | 3 ++- sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp | 4 ++-- .../test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus.hpp b/sycl/test-e2e/Assert/assert_in_multiple_tus.hpp index 585d75dfde7a0..eba14b45e54b8 100644 --- a/sycl/test-e2e/Assert/assert_in_multiple_tus.hpp +++ b/sycl/test-e2e/Assert/assert_in_multiple_tus.hpp @@ -3,6 +3,7 @@ #include #include +#include #ifdef DEFINE_NDEBUG_INFILE1 #define NDEBUG @@ -44,7 +45,7 @@ void enqueueKernel_1_fromFile1(queue *Q) { int main(int Argc, const char *Argv[]) { - queue Q; + queue Q({sycl::property::queue::in_order{}}); enqueueKernel_1_fromFile1(&Q); enqueueKernel_2_fromFile2(&Q); Q.wait(); diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp index 3b66660b8c2b5..267034f6fae33 100644 --- a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp +++ b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug.cpp @@ -16,10 +16,10 @@ // CHECK-NOT: this message from calculus // CUDA uses block/thread vs global/local id for SYCL, also it shows the // position of a thread within the block, not the absolute ID. -// CHECK: {{.*}}assert_in_multiple_tus.hpp:22: int checkFunction(): {{global id: \[5|block: \[1}},0,0], +// CHECK: {{.*}}assert_in_multiple_tus.hpp:23: int checkFunction(): {{global id: \[5|block: \[1}},0,0], // CHECK-SAME: {{.*}} [1,0,0] Assertion `X && "Nil in result"` failed // CHECK-NOT: this message from file2 // CHECK-NOT: The test ended. // -// CHECK-ACC-NOT: {{.*}}assert_in_multiple_tus.hpp:22: int checkFunction(): {{.*}} +// CHECK-ACC-NOT: {{.*}}assert_in_multiple_tus.hpp:23: int checkFunction(): {{.*}} // CHECK-ACC: The test ended. 
diff --git a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp index 9e02e01681190..9537b977b560d 100644 --- a/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp +++ b/sycl/test-e2e/Assert/assert_in_multiple_tus_one_ndebug_win.cpp @@ -6,10 +6,10 @@ // CHECK-NOT: this message from calculus // FIXME Windows version prints '(null)' instead of '' once in a // while for some insane reason. -// CHECK: {{.*}}assert_in_multiple_tus.hpp:22: {{|\(null\)}}: {{.*}} [5,0,0], +// CHECK: {{.*}}assert_in_multiple_tus.hpp:23: {{|\(null\)}}: {{.*}} [5,0,0], // CHECK-SAME: {{.*}} [1,0,0] Assertion `X && "Nil in result"` failed. // CHECK-NOT: this message from file2 // CHECK-NOT: The test ended. // -// CHECK-ACC-NOT: {{.*}}assert_in_multiple_tus.hpp:22: {{|\(null\)}}: {{.*}} [5,0,0], +// CHECK-ACC-NOT: {{.*}}assert_in_multiple_tus.hpp:23: {{|\(null\)}}: {{.*}} [5,0,0], // CHECK-ACC: The test ended. From f7fc46dcd10d429fa559c5773380cfdc48e8e4a6 Mon Sep 17 00:00:00 2001 From: aelovikov-intel Date: Tue, 28 Jan 2025 14:44:27 -0800 Subject: [PATCH 37/45] [SYCL][E2E] Use "raw string" in `re.search(r"", ...)` (#16807) We've been getting ``` .../sycl/test-e2e/format.py:34: SyntaxWarning: invalid escape sequence '\.' win = re.search("win: *([0-9]{3}\.[0-9]{4})", line) ``` before this PR. 
--- sycl/test-e2e/format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/test-e2e/format.py b/sycl/test-e2e/format.py index 812fec75ab732..79dda30016d96 100644 --- a/sycl/test-e2e/format.py +++ b/sycl/test-e2e/format.py @@ -25,13 +25,13 @@ def parse_min_intel_driver_req(line_number, line, output): if not output: output = {} - lin = re.search("lin: *([0-9]{5})", line) + lin = re.search(r"lin: *([0-9]{5})", line) if lin: if "lin" in output: raise ValueError('Multiple entries for "lin" version') output["lin"] = int(lin.group(1)) - win = re.search("win: *([0-9]{3}\.[0-9]{4})", line) + win = re.search(r"win: *([0-9]{3}\.[0-9]{4})", line) if win: if "win" in output: raise ValueError('Multiple entries for "win" version') From 8324bb25e1590d46aa1adfdf7c117304cedf111e Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Tue, 28 Jan 2025 16:31:12 -0700 Subject: [PATCH 38/45] [SYCL][E2E] Add `InlineAsm/lit.local.cfg` to clean up `REQUIRES`/`UNSUPPORTED` (#16311) All test in this folder use `REQUIRES: gpu,linux` and `UNSUPPORTED: cuda, hip`. So using a lit.local.cfg file makes sense here. 
--- .../InlineAsm/Negative/asm_bad_opcode.cpp | 5 ++- .../Negative/asm_bad_operand_syntax.cpp | 5 ++- .../Negative/asm_duplicate_label.cpp | 5 ++- .../Negative/asm_illegal_exec_size.cpp | 5 ++- .../InlineAsm/Negative/asm_missing_label.cpp | 5 ++- .../InlineAsm/Negative/asm_missing_region.cpp | 5 ++- .../InlineAsm/Negative/asm_simple.cpp | 5 ++- .../InlineAsm/Negative/asm_undefined_decl.cpp | 5 ++- .../InlineAsm/Negative/asm_undefined_pred.cpp | 5 ++- .../InlineAsm/Negative/asm_wrong_declare.cpp | 5 ++- sycl/test-e2e/InlineAsm/asm_16_empty.cpp | 3 +- .../test-e2e/InlineAsm/asm_16_matrix_mult.cpp | 3 +- .../InlineAsm/asm_16_no_input_int.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_8_empty.cpp | 3 +- .../test-e2e/InlineAsm/asm_8_no_input_int.cpp | 3 +- .../InlineAsm/asm_arbitrary_ops_order.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_float_add.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_float_neg.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_if.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_imm_arg.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_loop.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_mul.cpp | 3 +- .../InlineAsm/asm_multiple_instructions.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_no_operands.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_no_output.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_plus_mod.cpp | 3 +- sycl/test-e2e/InlineAsm/asm_switch.cpp | 3 +- sycl/test-e2e/InlineAsm/letter_example.cpp | 3 +- sycl/test-e2e/InlineAsm/lit.local.cfg | 9 +++++ sycl/test-e2e/InlineAsm/malloc_shared_32.cpp | 3 +- .../InlineAsm/malloc_shared_in_out_dif.cpp | 3 +- .../InlineAsm/malloc_shared_no_input.cpp | 3 +- .../no-unsupported-without-info.cpp | 36 +------------------ 36 files changed, 54 insertions(+), 113 deletions(-) create mode 100644 sycl/test-e2e/InlineAsm/lit.local.cfg diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp 
b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp index 49d6f9ae9331a..346cb2cf956a8 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_opcode.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp index 168b339c403b6..b02dde8cca5b0 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_bad_operand_syntax.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp index 5b996fcc983f1..aa0a20f3c54de 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_duplicate_label.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp index cd531c921b040..1394f711cdeb2 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_illegal_exec_size.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out 
-#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp index b29d76aa5aec0..2e47f95737128 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_label.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp index 409692cc0f831..78dc02d12a589 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_missing_region.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp index 325ee710ba873..e71569125c1ed 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_simple.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp index 2674681931156..696f63a8fa6de 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_decl.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: 
cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp index d9bba38ef6936..d69d96175d380 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_undefined_pred.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp index 43e98db43c34e..cc68e80982d6d 100644 --- a/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp +++ b/sycl/test-e2e/InlineAsm/Negative/asm_wrong_declare.cpp @@ -1,9 +1,8 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out -#include "../include/asmhelper.h" +#include "include/asmhelper.h" #include struct KernelFunctor { diff --git a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp index 0bc3d9624f749..bffcc44e25d3a 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_empty.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_empty.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp index f92912919c786..88f8a384023df 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_matrix_mult.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 
// RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp index f92912919c786..88f8a384023df 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_no_input_int.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp index 07286b34b8d4d..7a37b60edea73 100644 --- a/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp +++ b/sycl/test-e2e/InlineAsm/asm_16_no_opts.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp index a87704672680b..f784205bc6029 100644 --- a/sycl/test-e2e/InlineAsm/asm_8_empty.cpp +++ b/sycl/test-e2e/InlineAsm/asm_8_empty.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda -// REQUIRES: gpu,linux,sg-8 +// REQUIRES: sg-8 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp index 82dc6245a6770..7fba71b894bdb 100644 --- a/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp +++ b/sycl/test-e2e/InlineAsm/asm_8_no_input_int.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-8 +// REQUIRES: sg-8 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp index 9097a40131bbb..fcf5d5f3fcab2 100644 --- a/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp +++ b/sycl/test-e2e/InlineAsm/asm_arbitrary_ops_order.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // 
RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp index a6a754289e533..676d09382a6b6 100644 --- a/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp +++ b/sycl/test-e2e/InlineAsm/asm_decl_in_scope.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_float_add.cpp b/sycl/test-e2e/InlineAsm/asm_float_add.cpp index f1d4d681b8edc..5b96b287232b9 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_add.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_add.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp index b10aec8e47278..fa5f1378e7896 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_imm_arg.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp index 307a853fa407f..c9a249c24f12f 100644 --- a/sycl/test-e2e/InlineAsm/asm_float_neg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_float_neg.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_if.cpp b/sycl/test-e2e/InlineAsm/asm_if.cpp index 54a679a0509ea..9225390cb9d50 100644 --- a/sycl/test-e2e/InlineAsm/asm_if.cpp +++ b/sycl/test-e2e/InlineAsm/asm_if.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp 
b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp index 2506938a1bef8..a3e46e0ddae44 100644 --- a/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp +++ b/sycl/test-e2e/InlineAsm/asm_imm_arg.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_loop.cpp b/sycl/test-e2e/InlineAsm/asm_loop.cpp index eccc02ae1cab7..94b2400bfa611 100644 --- a/sycl/test-e2e/InlineAsm/asm_loop.cpp +++ b/sycl/test-e2e/InlineAsm/asm_loop.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_mul.cpp b/sycl/test-e2e/InlineAsm/asm_mul.cpp index df759b75d2a05..22b5b9abbf12d 100644 --- a/sycl/test-e2e/InlineAsm/asm_mul.cpp +++ b/sycl/test-e2e/InlineAsm/asm_mul.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp index 29b31e3d11ccd..e862af40b7767 100644 --- a/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp +++ b/sycl/test-e2e/InlineAsm/asm_multiple_instructions.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -DTO_PASS -o %t.out.pass // RUN: %{run} %t.out.pass // RUN: %{build} -o %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp index bf9df8ca40ae2..fd68892b6b7ed 100644 --- a/sycl/test-e2e/InlineAsm/asm_no_operands.cpp +++ b/sycl/test-e2e/InlineAsm/asm_no_operands.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_no_output.cpp b/sycl/test-e2e/InlineAsm/asm_no_output.cpp index 
3a130f1e4b819..08153e72a9a77 100644 --- a/sycl/test-e2e/InlineAsm/asm_no_output.cpp +++ b/sycl/test-e2e/InlineAsm/asm_no_output.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp index 077446cf72859..df20dcbf2bc91 100644 --- a/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp +++ b/sycl/test-e2e/InlineAsm/asm_plus_mod.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/asm_switch.cpp b/sycl/test-e2e/InlineAsm/asm_switch.cpp index 4f96a55b554d2..ce94e912206f4 100644 --- a/sycl/test-e2e/InlineAsm/asm_switch.cpp +++ b/sycl/test-e2e/InlineAsm/asm_switch.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda || hip -// REQUIRES: gpu,linux,sg-16 +// REQUIRES: sg-16 // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/letter_example.cpp b/sycl/test-e2e/InlineAsm/letter_example.cpp index 50e5ff784446e..780e33d77e803 100644 --- a/sycl/test-e2e/InlineAsm/letter_example.cpp +++ b/sycl/test-e2e/InlineAsm/letter_example.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations +// REQUIRES: sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/lit.local.cfg b/sycl/test-e2e/InlineAsm/lit.local.cfg new file mode 100644 index 0000000000000..854d405afa90d --- /dev/null +++ b/sycl/test-e2e/InlineAsm/lit.local.cfg @@ -0,0 +1,9 @@ +config.required_features += ['gpu', 'linux', 'target-spir'] + +config.substitutions.append(("%helper-includes", "-I {}".format(os.path.dirname(os.path.abspath(__file__))))) +original_clangxx="" +for substitution in config.substitutions: + if substitution[0] == "%clangxx": + original_clangxx=substitution[1] 
+config.substitutions.insert(0, + ("%clangxx", original_clangxx + ' %helper-includes ')) diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp index 5271683548e72..db96f418927dd 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_32.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-32,aspect-usm_shared_allocations +// REQUIRES: sg-32,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp index 4751b94b6fd77..ac1419569f8fa 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_in_out_dif.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations +// REQUIRES: sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp index 41d5181f37092..2b617e1acdbd8 100644 --- a/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp +++ b/sycl/test-e2e/InlineAsm/malloc_shared_no_input.cpp @@ -1,5 +1,4 @@ -// UNSUPPORTED: cuda, hip -// REQUIRES: gpu,linux,sg-16,aspect-usm_shared_allocations +// REQUIRES: sg-16,aspect-usm_shared_allocations // RUN: %{build} -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp index be4b7933c7813..f95323d179fb0 100644 --- a/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp +++ b/sycl/test/e2e_test_requirements/no-unsupported-without-info.cpp @@ -54,7 +54,7 @@ // tests to match the required format and in that case you should just update // (i.e. reduce) the number and the list below. 
// -// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 409 +// NUMBER-OF-UNSUPPORTED-WITHOUT-INFO: 375 // // List of improperly UNSUPPORTED tests. // Remove the CHECK once the test has been properly UNSUPPORTED. @@ -233,40 +233,6 @@ // CHECK-NEXT: HierPar/hier_par_wgscope.cpp // CHECK-NEXT: HostInteropTask/host-task-failure.cpp // CHECK-NEXT: HostInteropTask/interop-task.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_bad_opcode.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_bad_operand_syntax.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_duplicate_label.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_illegal_exec_size.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_missing_label.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_missing_region.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_simple.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_undefined_decl.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_undefined_pred.cpp -// CHECK-NEXT: InlineAsm/Negative/asm_wrong_declare.cpp -// CHECK-NEXT: InlineAsm/asm_16_empty.cpp -// CHECK-NEXT: InlineAsm/asm_16_matrix_mult.cpp -// CHECK-NEXT: InlineAsm/asm_16_no_input_int.cpp -// CHECK-NEXT: InlineAsm/asm_16_no_opts.cpp -// CHECK-NEXT: InlineAsm/asm_8_empty.cpp -// CHECK-NEXT: InlineAsm/asm_8_no_input_int.cpp -// CHECK-NEXT: InlineAsm/asm_arbitrary_ops_order.cpp -// CHECK-NEXT: InlineAsm/asm_decl_in_scope.cpp -// CHECK-NEXT: InlineAsm/asm_float_add.cpp -// CHECK-NEXT: InlineAsm/asm_float_imm_arg.cpp -// CHECK-NEXT: InlineAsm/asm_float_neg.cpp -// CHECK-NEXT: InlineAsm/asm_if.cpp -// CHECK-NEXT: InlineAsm/asm_imm_arg.cpp -// CHECK-NEXT: InlineAsm/asm_loop.cpp -// CHECK-NEXT: InlineAsm/asm_mul.cpp -// CHECK-NEXT: InlineAsm/asm_multiple_instructions.cpp -// CHECK-NEXT: InlineAsm/asm_no_operands.cpp -// CHECK-NEXT: InlineAsm/asm_no_output.cpp -// CHECK-NEXT: InlineAsm/asm_plus_mod.cpp -// CHECK-NEXT: InlineAsm/asm_switch.cpp -// CHECK-NEXT: InlineAsm/letter_example.cpp -// CHECK-NEXT: InlineAsm/malloc_shared_32.cpp -// CHECK-NEXT: InlineAsm/malloc_shared_in_out_dif.cpp -// 
CHECK-NEXT: InlineAsm/malloc_shared_no_input.cpp // CHECK-NEXT: InvokeSimd/Feature/ImplicitSubgroup/SPMD_invoke_ESIMD_external.cpp // CHECK-NEXT: InvokeSimd/Feature/ImplicitSubgroup/popcnt.cpp // CHECK-NEXT: InvokeSimd/Feature/popcnt.cpp From e8564403232c13b3716aa68190084880f2a98a6c Mon Sep 17 00:00:00 2001 From: David Garcia Orozco Date: Tue, 28 Jan 2025 16:37:29 -0700 Subject: [PATCH 39/45] [SYCL][E2E] Remove `REQUIRES: build-and-run-mode` from `syclcompat` and `Adapters` tests (#16795) As of #16725 tests can properly react to features that affect compilation on build-only mode (i.e., libraries or OS), additionally we can also mark if a test should only be built for a specific triple using the `target-*` features. This pr removes `REQUIRES: build-and-run-mode` from syclcompat and Adapters tests, and either lets the test be marked as unsupported due to requiring a missing build feature (`windows` or `cuda_dev_kit`), or the test is marked as unsupported for `spir` by requiring `target-nvidia` --- sycl/test-e2e/Adapters/cuda_queue_priority.cpp | 1 - sycl/test-e2e/Adapters/dll-detach-order.cpp | 1 - sycl/test-e2e/syclcompat/kernel/kernel_win.cpp | 1 - sycl/test-e2e/syclcompat/launch/launch_properties.cpp | 5 +++-- .../syclcompat/memory/local_memory_ptr_to_integer.cpp | 3 +-- 5 files changed, 4 insertions(+), 7 deletions(-) diff --git a/sycl/test-e2e/Adapters/cuda_queue_priority.cpp b/sycl/test-e2e/Adapters/cuda_queue_priority.cpp index 008637b91f3e2..031dc252c578f 100644 --- a/sycl/test-e2e/Adapters/cuda_queue_priority.cpp +++ b/sycl/test-e2e/Adapters/cuda_queue_priority.cpp @@ -1,5 +1,4 @@ // REQUIRES: gpu, cuda, cuda_dev_kit -// REQUIRES: build-and-run-mode // RUN: %{build} %cuda_options -o %t.out // RUN: %{run} %t.out // diff --git a/sycl/test-e2e/Adapters/dll-detach-order.cpp b/sycl/test-e2e/Adapters/dll-detach-order.cpp index 72d014eb066bb..c1cf32816a240 100644 --- a/sycl/test-e2e/Adapters/dll-detach-order.cpp +++ b/sycl/test-e2e/Adapters/dll-detach-order.cpp @@ 
-1,5 +1,4 @@ // REQUIRES: windows -// REQUIRES: build-and-run-mode // RUN: env SYCL_UR_TRACE=-1 sycl-ls | FileCheck %s // ensure that the adapters are detached AFTER urLoaderTearDown is done diff --git a/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp b/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp index 85ecf5687ca63..53ce174a8b7c0 100644 --- a/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp +++ b/sycl/test-e2e/syclcompat/kernel/kernel_win.cpp @@ -1,5 +1,4 @@ // REQUIRES: windows -// REQUIRES: build-and-run-mode // DEFINE: %{sharedflag} = %if cl_options %{/clang:-shared%} %else %{-shared%} diff --git a/sycl/test-e2e/syclcompat/launch/launch_properties.cpp b/sycl/test-e2e/syclcompat/launch/launch_properties.cpp index 5d07ad256b328..19176388b33e8 100644 --- a/sycl/test-e2e/syclcompat/launch/launch_properties.cpp +++ b/sycl/test-e2e/syclcompat/launch/launch_properties.cpp @@ -22,8 +22,9 @@ * sycl/test-e2e/ClusterLaunch/cluster_launch_parallel_for.cpp **************************************************************************/ -// REQUIRES: aspect-ext_oneapi_cuda_cluster_group -// REQUIRES: build-and-run-mode +// REQUIRES: target-nvidia, aspect-ext_oneapi_cuda_cluster_group +// XFAIL: * +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16794 // RUN: %{build} -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_90 -o %t.out // RUN: %{run} %t.out diff --git a/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp b/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp index edd9e63a3752a..9d22804309a2c 100644 --- a/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp +++ b/sycl/test-e2e/syclcompat/memory/local_memory_ptr_to_integer.cpp @@ -1,5 +1,4 @@ -// REQUIRES: cuda -// REQUIRES: build-and-run-mode +// REQUIRES: target-nvidia // RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_75 -o %t.out // RUN: %{run} %t.out #include From 5afb184bf595ce3f49781a991201a9d279602a43 Mon Sep 17 00:00:00 2001 From: Steffen Larsen Date: 
Wed, 29 Jan 2025 11:09:02 +0100 Subject: [PATCH 40/45] [SYCL][Docs] Rename restrict property to unaliased (#16814) This commit renames the `restrict` kernel argument property to `unaliased` to avoid conflicting with common code patterns in C/C++ projects. --------- Signed-off-by: Larsen, Steffen --- clang/lib/CodeGen/CGCall.cpp | 2 +- .../CodeGenSYCL/sycl_restrict_property.cpp | 79 ------------------- .../CodeGenSYCL/sycl_unaliased_property.cpp | 79 +++++++++++++++++++ ..._ext_oneapi_kernel_arg_properties.asciidoc | 20 ++--- .../properties.hpp | 18 ++--- .../sycl/ext/oneapi/properties/property.hpp | 2 +- ...strict.cpp => annotated_arg_unaliased.cpp} | 4 +- ...strict.cpp => annotated_ptr_unaliased.cpp} | 6 +- .../{restrict.cpp => unaliased.cpp} | 4 +- .../{restrict.cpp => unaliased.cpp} | 4 +- 10 files changed, 109 insertions(+), 109 deletions(-) delete mode 100644 clang/test/CodeGenSYCL/sycl_restrict_property.cpp create mode 100644 clang/test/CodeGenSYCL/sycl_unaliased_property.cpp rename sycl/test-e2e/Annotated_arg_ptr/{annotated_arg_restrict.cpp => annotated_arg_unaliased.cpp} (80%) rename sycl/test-e2e/Annotated_arg_ptr/{annotated_ptr_restrict.cpp => annotated_ptr_unaliased.cpp} (87%) rename sycl/test/check_device_code/extensions/annotated_arg/{restrict.cpp => unaliased.cpp} (86%) rename sycl/test/check_device_code/extensions/annotated_ptr/{restrict.cpp => unaliased.cpp} (95%) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index ba8e446bd9a8a..79bfd6b31a009 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -3030,7 +3030,7 @@ static bool hasSYCLRestrictPropertyIRAttr(const VarDecl *Arg, return std::any_of( NameValuePairs.begin(), NameValuePairs.end(), [](const std::pair &NameValuePair) { - return NameValuePair.first == "sycl-restrict"; + return NameValuePair.first == "sycl-unaliased"; }); } diff --git a/clang/test/CodeGenSYCL/sycl_restrict_property.cpp b/clang/test/CodeGenSYCL/sycl_restrict_property.cpp 
deleted file mode 100644 index b62cac27159a6..0000000000000 --- a/clang/test/CodeGenSYCL/sycl_restrict_property.cpp +++ /dev/null @@ -1,79 +0,0 @@ -// RUN: %clang_cc1 -fsycl-is-device %s -emit-llvm -triple spir64-unknown-unknown -o - | FileCheck %s - -struct __attribute__((sycl_special_class)) - [[__sycl_detail__::sycl_type(annotated_arg)]] - AnnotatedIntPtr { - void __init([[__sycl_detail__::add_ir_attributes_kernel_parameter( - "sycl-restrict", nullptr)]] - __attribute__((opencl_global)) int* InPtr) { - Ptr = InPtr; - } - - int &operator[](unsigned I) const { return Ptr[I]; } - - __attribute__((opencl_global)) int *Ptr; -}; - -template -__attribute__((sycl_kernel)) void kernel(const Func &kernelFunc) { - kernelFunc(); -} - -int main() { - { - int *a; - int *b; - int *c; - kernel([a, b, c]() { c[0] = a[0] + b[0]; }); - // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_norestrict(ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}) - } - { - AnnotatedIntPtr a; - int *b; - int *c; - kernel([a, b, c]() { c[0] = a[0] + b[0]; }); - // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_restrict1(ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}) - } - { - int *a; - AnnotatedIntPtr b; - int *c; - kernel([a, b, c]() { c[0] = a[0] + b[0]; }); - // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_restrict2(ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}) - } - { - int *a; - int *b; - AnnotatedIntPtr c; - kernel([a, b, c]() { c[0] = a[0] + b[0]; }); - // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_restrict3(ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}) - } - { - AnnotatedIntPtr a; - AnnotatedIntPtr 
b; - int *c; - kernel([a, b, c]() { c[0] = a[0] + b[0]; }); - // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_restrict4(ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}) - } - { - AnnotatedIntPtr a; - int *b; - AnnotatedIntPtr c; - kernel([a, b, c]() { c[0] = a[0] + b[0]; }); - // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_restrict5(ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}) - } - { - int *a; - AnnotatedIntPtr b; - AnnotatedIntPtr c; - kernel([a, b, c]() { c[0] = a[0] + b[0]; }); - // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_restrict6(ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}) - } - { - AnnotatedIntPtr a; - AnnotatedIntPtr b; - AnnotatedIntPtr c; - kernel([a, b, c]() { c[0] = a[0] + b[0]; }); - // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_restrict7(ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %{{.*}}) - } -} diff --git a/clang/test/CodeGenSYCL/sycl_unaliased_property.cpp b/clang/test/CodeGenSYCL/sycl_unaliased_property.cpp new file mode 100644 index 0000000000000..f5ad55ae4f464 --- /dev/null +++ b/clang/test/CodeGenSYCL/sycl_unaliased_property.cpp @@ -0,0 +1,79 @@ +// RUN: %clang_cc1 -fsycl-is-device %s -emit-llvm -triple spir64-unknown-unknown -o - | FileCheck %s + +struct __attribute__((sycl_special_class)) + [[__sycl_detail__::sycl_type(annotated_arg)]] + AnnotatedIntPtr { + void __init([[__sycl_detail__::add_ir_attributes_kernel_parameter( + "sycl-unaliased", nullptr)]] + __attribute__((opencl_global)) int* InPtr) { 
+ Ptr = InPtr; + } + + int &operator[](unsigned I) const { return Ptr[I]; } + + __attribute__((opencl_global)) int *Ptr; +}; + +template +__attribute__((sycl_kernel)) void kernel(const Func &kernelFunc) { + kernelFunc(); +} + +int main() { + { + int *a; + int *b; + int *c; + kernel([a, b, c]() { c[0] = a[0] + b[0]; }); + // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_nounaliased(ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}) + } + { + AnnotatedIntPtr a; + int *b; + int *c; + kernel([a, b, c]() { c[0] = a[0] + b[0]; }); + // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_unaliased1(ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}) + } + { + int *a; + AnnotatedIntPtr b; + int *c; + kernel([a, b, c]() { c[0] = a[0] + b[0]; }); + // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_unaliased2(ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}) + } + { + int *a; + int *b; + AnnotatedIntPtr c; + kernel([a, b, c]() { c[0] = a[0] + b[0]; }); + // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_unaliased3(ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}) + } + { + AnnotatedIntPtr a; + AnnotatedIntPtr b; + int *c; + kernel([a, b, c]() { c[0] = a[0] + b[0]; }); + // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_unaliased4(ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}) + } + { + AnnotatedIntPtr a; + int *b; + AnnotatedIntPtr c; + kernel([a, b, c]() { c[0] = a[0] + b[0]; }); + // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_unaliased5(ptr addrspace(1) noalias 
noundef align 4 "sycl-unaliased" %{{.*}}, ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}) + } + { + int *a; + AnnotatedIntPtr b; + AnnotatedIntPtr c; + kernel([a, b, c]() { c[0] = a[0] + b[0]; }); + // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_unaliased6(ptr addrspace(1) noundef align 4 %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}) + } + { + AnnotatedIntPtr a; + AnnotatedIntPtr b; + AnnotatedIntPtr c; + kernel([a, b, c]() { c[0] = a[0] + b[0]; }); + // CHECK-DAG: define {{.*}}spir_kernel {{.*}}kernel_unaliased7(ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}, ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %{{.*}}) + } +} diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_arg_properties.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_arg_properties.asciidoc index 4334aa9938cd0..8f135d03c2be1 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_arg_properties.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_arg_properties.asciidoc @@ -88,9 +88,9 @@ implementation supports. |Initial version of this extension. |=== -=== `restrict` property +=== `unaliased` property -The `restrict` property defined here is only meaningful on the kernel arguments +The `unaliased` property defined here is only meaningful on the kernel arguments when the kernel argument is a pointer type. It is ignored for other types. This property is not meaningful within the kernel body. @@ -98,19 +98,19 @@ This property is not meaningful within the kernel body. 
```c++ namespace sycl::ext::oneapi::experimental { -struct restrict_key { - using value_t = property_value; +struct unaliased_key { + using value_t = property_value; }; -inline constexpr restrict_key::value_t restrict; +inline constexpr unaliased_key::value_t unaliased; template struct is_property_key_of< - restrict_key, annotated_ptr> : std::true_type {}; + unaliased_key, annotated_ptr> : std::true_type {}; template struct is_property_key_of< - restrict_key, annotated_arg> : std::true_type {}; + unaliased_key, annotated_arg> : std::true_type {}; } // namespace sycl::ext::oneapi::experimental ``` === `alignment` property @@ -152,7 +152,7 @@ struct is_property_key_of< a| [source,c++] ---- -restrict +unaliased ---- a| This is an assertion by the application that the pointer kernel arguments marked @@ -195,8 +195,8 @@ using sycl::ext::oneapi::experimental; int* ptr_b = ...; // Add properties - auto arg_a = annotated_ptr(ptr_a, properties{restrict, alignment<32>}); - auto arg_n = annotated_arg(ptr_b, properties{restrict}); + auto arg_a = annotated_ptr(ptr_a, properties{unaliased, alignment<32>}); + auto arg_n = annotated_arg(ptr_b, properties{unaliased}); ... 
q.single_task([=] { diff --git a/sycl/include/sycl/ext/oneapi/experimental/common_annotated_properties/properties.hpp b/sycl/include/sycl/ext/oneapi/experimental/common_annotated_properties/properties.hpp index 04258b9280fb1..852e886c2f228 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/common_annotated_properties/properties.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/common_annotated_properties/properties.hpp @@ -66,12 +66,12 @@ struct propagateToPtrAnnotation> //===----------------------------------------------------------------------===// // Common properties of annotated_arg/annotated_ptr //===----------------------------------------------------------------------===// -struct restrict_key - : detail::compile_time_property_key { - using value_t = property_value; +struct unaliased_key + : detail::compile_time_property_key { + using value_t = property_value; }; -inline constexpr restrict_key::value_t restrict; +inline constexpr unaliased_key::value_t unaliased; struct alignment_key : detail::compile_time_property_key { @@ -82,7 +82,7 @@ struct alignment_key template inline constexpr alignment_key::value_t alignment; template -struct is_valid_property +struct is_valid_property : std::bool_constant::value> {}; template @@ -90,7 +90,7 @@ struct is_valid_property> : std::bool_constant::value> {}; template -struct is_property_key_of> +struct is_property_key_of> : std::true_type {}; template @@ -102,7 +102,7 @@ struct is_property_key_of> : std::true_type {}; template -struct is_property_key_of> +struct is_property_key_of> : std::true_type {}; template <> struct propagateToPtrAnnotation : std::true_type {}; @@ -113,8 +113,8 @@ template struct PropertyMetaInfo> { static constexpr int value = N; }; -template <> struct PropertyMetaInfo { - static constexpr const char *name = "sycl-restrict"; +template <> struct PropertyMetaInfo { + static constexpr const char *name = "sycl-unaliased"; static constexpr std::nullptr_t value = nullptr; }; diff --git 
a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp index 8de51110e7089..9fd7752889f4f 100644 --- a/sycl/include/sycl/ext/oneapi/properties/property.hpp +++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp @@ -222,7 +222,7 @@ enum PropKind : uint32_t { Deterministic = 77, InitializeToIdentity = 78, WorkGroupScratchSize = 79, - Restrict = 80, + Unaliased = 80, EventMode = 81, NativeLocalBlockIO = 82, // PropKindSize must always be the last value. diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg_restrict.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg_unaliased.cpp similarity index 80% rename from sycl/test-e2e/Annotated_arg_ptr/annotated_arg_restrict.cpp rename to sycl/test-e2e/Annotated_arg_ptr/annotated_arg_unaliased.cpp index eefaa25c66286..774757e652472 100644 --- a/sycl/test-e2e/Annotated_arg_ptr/annotated_arg_restrict.cpp +++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_arg_unaliased.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t.out // REQUIRES: aspect-usm_shared_allocations -// Checks that restrict annotated_arg works in device code. +// Checks that unaliased annotated_arg works in device code. 
#include #include @@ -15,7 +15,7 @@ int main() { int *Ptr = sycl::malloc_shared(1, Q); syclexp::annotated_arg + decltype(syclexp::properties(syclexp::unaliased))> AnnotArg{Ptr}; Q.submit([&](sycl::handler &CGH) { CGH.single_task([=]() { *AnnotArg = 42; }); diff --git a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr_restrict.cpp b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr_unaliased.cpp similarity index 87% rename from sycl/test-e2e/Annotated_arg_ptr/annotated_ptr_restrict.cpp rename to sycl/test-e2e/Annotated_arg_ptr/annotated_ptr_unaliased.cpp index 42343b6986ae6..f989bf741995e 100644 --- a/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr_restrict.cpp +++ b/sycl/test-e2e/Annotated_arg_ptr/annotated_ptr_unaliased.cpp @@ -2,7 +2,7 @@ // RUN: %{run} %t.out // REQUIRES: aspect-usm_shared_allocations -// Checks that restrict annotated_ptr works in device code. +// Checks that unaliased annotated_ptr works in device code. #include #include @@ -14,7 +14,7 @@ int main() { sycl::queue Q; auto Ptr = sycl::malloc_shared(1, Q); - syclexp::annotated_ptr + syclexp::annotated_ptr AnnotPtr{Ptr}; Q.submit([&](sycl::handler &CGH) { CGH.single_task([=]() { *AnnotPtr = 42; }); @@ -25,4 +25,4 @@ int main() { return 0; } -// CHECK-IR: spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_(ptr addrspace(1) noalias noundef align 4 "sycl-restrict" %_arg_AnnotPtr) +// CHECK-IR: spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_(ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" %_arg_AnnotPtr) diff --git a/sycl/test/check_device_code/extensions/annotated_arg/restrict.cpp b/sycl/test/check_device_code/extensions/annotated_arg/unaliased.cpp similarity index 86% rename from sycl/test/check_device_code/extensions/annotated_arg/restrict.cpp rename to sycl/test/check_device_code/extensions/annotated_arg/unaliased.cpp index 7e234e244815c..028799fbe5214 100644 --- a/sycl/test/check_device_code/extensions/annotated_arg/restrict.cpp +++ 
b/sycl/test/check_device_code/extensions/annotated_arg/unaliased.cpp @@ -9,7 +9,7 @@ int main() { auto Ptr = sycl::malloc_shared(1, Q); syclexp::annotated_arg + decltype(syclexp::properties(syclexp::unaliased))> AnnotArg{Ptr}; Q.submit([&](sycl::handler &CGH) { CGH.single_task([=]() { *AnnotArg = 42; }); @@ -19,4 +19,4 @@ int main() { return 0; } -// CHECK-IR: spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_(ptr addrspace(1) noalias noundef align 4 "sycl-restrict" +// CHECK-IR: spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_(ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" diff --git a/sycl/test/check_device_code/extensions/annotated_ptr/restrict.cpp b/sycl/test/check_device_code/extensions/annotated_ptr/unaliased.cpp similarity index 95% rename from sycl/test/check_device_code/extensions/annotated_ptr/restrict.cpp rename to sycl/test/check_device_code/extensions/annotated_ptr/unaliased.cpp index 8b27ee83827c6..cf99a03179cb2 100644 --- a/sycl/test/check_device_code/extensions/annotated_ptr/restrict.cpp +++ b/sycl/test/check_device_code/extensions/annotated_ptr/unaliased.cpp @@ -8,7 +8,7 @@ int main() { sycl::queue Q; auto Ptr = sycl::malloc_shared(1, Q); - syclexp::annotated_ptr + syclexp::annotated_ptr AnnotPtr{Ptr}; Q.submit([&](sycl::handler &CGH) { CGH.single_task([=]() { *AnnotPtr = 42; }); @@ -18,4 +18,4 @@ int main() { return 0; } -// CHECK-IR: spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_(ptr addrspace(1) noalias noundef align 4 "sycl-restrict" +// CHECK-IR: spir_kernel void @_ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlvE_(ptr addrspace(1) noalias noundef align 4 "sycl-unaliased" From 0b3f4e7df6966073e753b658da9526103bbd8b84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9sz=C3=A1ros=20Gergely?= Date: Wed, 29 Jan 2025 11:14:02 +0100 Subject: [PATCH 41/45] [SYCL][CMake][MSVC] Fix link.exe /OPT detection (#16811) CMake's check_linker_flag does not split flags by spaces, so the current
call passes the single option `"/OPT:REF LINKER:/OPT:ICF"` with a space in it to link.exe. (The first `LINKER:` prefix is parsed). This was also broken before ede906ce6cb425718638091322f425b84676047f ([CMake][MSVC] Wrap more Linker flags for ICX (#16284)), where it would pass `"/OPT:REF /OPT:ICF"` as a single option. This results in the check failing and so the build does not ever enable these flags, even though they would be supported if the check was correct. Use comma as the separator as supported by the `LINKER:` syntax to fix it. --- sycl/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index cdc84bc122b57..ace416dc70d8a 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -75,7 +75,7 @@ if(MSVC) add_link_options("LINKER:/DEBUG") # Enable unreferenced removal and ICF in Release mode. - check_linker_flag(CXX "LINKER:/OPT:REF LINKER:/OPT:ICF" LINKER_SUPPORTS_OPTS) + check_linker_flag(CXX "LINKER:/OPT:REF,/OPT:ICF" LINKER_SUPPORTS_OPTS) if (LINKER_SUPPORTS_OPTS AND uppercase_CMAKE_BUILD_TYPE STREQUAL "RELEASE") add_link_options("LINKER:/OPT:REF" "LINKER:/OPT:ICF") endif() From 035fb0959c5e6da9aa42d3da1ea4f11b5be65d64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A9sz=C3=A1ros=20Gergely?= Date: Wed, 29 Jan 2025 11:14:18 +0100 Subject: [PATCH 42/45] [SYCL][NFC] Helper for implicit kernarg targets in LowerWGLocalMemory (#16766) To make the code more explicit and allow for future expansion. 
--- llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp | 31 ++++++++++++--------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp index 6c46b5c75d5d7..f69dd83be4157 100644 --- a/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp +++ b/llvm/lib/SYCLLowerIR/LowerWGLocalMemory.cpp @@ -49,12 +49,16 @@ class SYCLLowerWGLocalMemoryLegacy : public ModulePass { }; } // namespace +static bool usesKernelArgForDynWGLocalMem(const Triple &TT) { + return TT.isSPIROrSPIRV(); +} + std::vector> sycl::getKernelNamesUsingImplicitLocalMem(const Module &M) { std::vector> SPIRKernelNames; Triple TT(M.getTargetTriple()); - if (TT.isSPIROrSPIRV()) { + if (usesKernelArgForDynWGLocalMem(TT)) { auto GetArgumentPos = [&](const Function &F) -> int { for (const Argument &Arg : F.args()) if (F.getAttributes().hasParamAttr(Arg.getArgNo(), @@ -129,7 +133,7 @@ lowerDynamicLocalMemCallDirect(CallInst *CI, Triple TT, Value *GVPtr = [&]() -> Value * { IRBuilder<> Builder(CI); - if (TT.isSPIROrSPIRV()) + if (usesKernelArgForDynWGLocalMem(TT)) return Builder.CreateLoad(CI->getType(), LocalMemPlaceholder); return Builder.CreatePointerCast(LocalMemPlaceholder, CI->getType()); @@ -188,7 +192,7 @@ static bool dynamicWGLocalMemory(Module &M) { if (!LocalMemArrayGV) { assert(DLMFunc->isDeclaration() && "should have declaration only"); Type *LocalMemArrayTy = - TT.isSPIROrSPIRV() + usesKernelArgForDynWGLocalMem(TT) ? static_cast(PointerType::get(M.getContext(), LocalAS)) : static_cast( ArrayType::get(Type::getInt8Ty(M.getContext()), 0)); @@ -196,24 +200,25 @@ static bool dynamicWGLocalMemory(Module &M) { M, // module LocalMemArrayTy, // type false, // isConstant - TT.isSPIROrSPIRV() ? GlobalValue::LinkOnceODRLinkage - : GlobalValue::ExternalLinkage, // Linkage - TT.isSPIROrSPIRV() ? 
UndefValue::get(LocalMemArrayTy) - : nullptr, // Initializer - DYNAMIC_LOCALMEM_GV, // Name prefix - nullptr, // InsertBefore - GlobalVariable::NotThreadLocal, // ThreadLocalMode - LocalAS // AddressSpace + usesKernelArgForDynWGLocalMem(TT) + ? GlobalValue::LinkOnceODRLinkage + : GlobalValue::ExternalLinkage, // Linkage + usesKernelArgForDynWGLocalMem(TT) ? UndefValue::get(LocalMemArrayTy) + : nullptr, // Initializer + DYNAMIC_LOCALMEM_GV, // Name prefix + nullptr, // InsertBefore + GlobalVariable::NotThreadLocal, // ThreadLocalMode + LocalAS // AddressSpace ); LocalMemArrayGV->setUnnamedAddr(GlobalVariable::UnnamedAddr::Local); constexpr int DefaultMaxAlignment = 128; - if (!TT.isSPIROrSPIRV()) + if (!usesKernelArgForDynWGLocalMem(TT)) LocalMemArrayGV->setAlignment(Align{DefaultMaxAlignment}); } lowerLocalMemCall(DLMFunc, [&](CallInst *CI) { lowerDynamicLocalMemCallDirect(CI, TT, LocalMemArrayGV); }); - if (TT.isSPIROrSPIRV()) { + if (usesKernelArgForDynWGLocalMem(TT)) { SmallVector Kernels; llvm::for_each(M.functions(), [&](Function &F) { if (F.getCallingConv() == CallingConv::SPIR_KERNEL && From d142923d2a61302593f1cbf4bc6265be0a0c4de2 Mon Sep 17 00:00:00 2001 From: "Kenneth Benzie (Benie)" Date: Wed, 29 Jan 2025 10:54:57 +0000 Subject: [PATCH 43/45] [UR][CL] Fix invalid use of dlopen() (#16736) https://github.com/oneapi-src/unified-runtime/pull/2594 --- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index 85ea961d9468e..08fa4254ba96c 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,7 +1,7 @@ -# commit 78e1b33271d28d26845a4bfae7ae3b72c14e0e63 -# Merge: 902bb2e2 94b32ac2 -# Author: Ross Brunton -# Date: Tue Jan 28 11:02:44 2025 +0000 -# Merge pull request #2624 from RossBrunton/ross/msanfix -# Assert that Device is valid for memory 
poisoning -set(UNIFIED_RUNTIME_TAG 78e1b33271d28d26845a4bfae7ae3b72c14e0e63) +# commit 3a1b4c7b9ba952fad6f6ad36c01101bbf368347b +# Merge: c270a6b8 264d0468 +# Author: Kenneth Benzie (Benie) +# Date: Tue Jan 28 15:16:58 2025 +0000 +# Merge pull request #2594 from kbenzie/benie/cl-core-functions-no-dlopen +# Fix invalid use of dlopen() +set(UNIFIED_RUNTIME_TAG 3a1b4c7b9ba952fad6f6ad36c01101bbf368347b) From 11a73e7d37e4b1000123722c66df9257b5a5731d Mon Sep 17 00:00:00 2001 From: Omar Ahmed Date: Wed, 29 Jan 2025 10:55:27 +0000 Subject: [PATCH 44/45] Pass -foffload-lto instead of -flto for cuda/hip kernels in clangLinkerWrapper (#16605) ClangLinkerWrapper tool in one of its clang commands to generate ptx kernel binary from llvm bitcode kernel was using `-flto` option which should be only used for cpu code not gpu kernel code. This PR fixes that by changing that to `-foffload-lto` for cuda/hip kernels. This fixes [16413](https://github.com/intel/llvm/issues/16413) issue. --- clang/test/Driver/linker-wrapper.c | 18 +++++++++--------- .../ClangLinkerWrapper.cpp | 7 ++++++- sycl/test-e2e/NewOffloadDriver/multisource.cpp | 2 -- .../NewOffloadDriver/split-per-source-main.cpp | 3 --- .../sycl-external-with-optional-features.cpp | 2 -- 5 files changed, 15 insertions(+), 17 deletions(-) diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index 71dabbc602856..c6ad270979cd1 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -21,7 +21,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK -// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o +// NVPTX-LINK: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -foffload-lto {{.*}}.o {{.*}}.o // RUN: 
clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ @@ -30,7 +30,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --device-debug -O0 \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=NVPTX-LINK-DEBUG -// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -flto {{.*}}.o {{.*}}.o -g +// NVPTX-LINK-DEBUG: clang{{.*}} -o {{.*}}.img --target=nvptx64-nvidia-cuda -march=sm_70 -O2 -foffload-lto {{.*}}.o {{.*}}.o -g // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ @@ -39,7 +39,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK -// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -foffload-lto -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \ @@ -48,7 +48,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --save-temps -O2 \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LTO-TEMPS -// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -flto -Wl,--no-undefined {{.*}} -save-temps +// AMDGPU-LTO-TEMPS: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx1030 -O2 -foffload-lto -Wl,--no-undefined {{.*}} -save-temps // RUN: clang-offload-packager -o %t.out \ 
// RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ @@ -148,7 +148,7 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run --clang-backend \ // RUN: --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CLANG-BACKEND -// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o +// CLANG-BACKEND: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -foffload-lto -Wl,--no-undefined {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 @@ -171,8 +171,8 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t-on.o %t-off.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMD-TARGET-ID -// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o -// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -foffload-lto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -foffload-lto -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t-lib.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=generic @@ -187,8 +187,8 @@ __attribute__((visibility("protected"), used)) int x; // RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ // RUN: --linker-path=/usr/bin/ld %t1.o %t2.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=ARCH-ALL -// ARCH-ALL: clang{{.*}} -o {{.*}}.img 
--target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o -// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -flto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -foffload-lto -Wl,--no-undefined {{.*}}.o {{.*}}.o +// ARCH-ALL: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -foffload-lto -Wl,--no-undefined {{.*}}.o {{.*}}.o // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.elf.o,kind=openmp,triple=x86_64-unknown-linux-gnu \ diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index dbc04e69a4c26..8ea4f3ef87558 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -1533,12 +1533,17 @@ Expected clang(ArrayRef InputFiles, const ArgList &Args, }; // Forward all of the `--offload-opt` and similar options to the device. 
- CmdArgs.push_back("-flto"); for (auto &Arg : Args.filtered(OPT_offload_opt_eq_minus, OPT_mllvm)) CmdArgs.append( {"-Xlinker", Args.MakeArgString("--plugin-opt=" + StringRef(Arg->getValue()))}); + if (Triple.isNVPTX() || Triple.isAMDGPU()) { + CmdArgs.push_back("-foffload-lto"); + } else { + CmdArgs.push_back("-flto"); + } + if (!Triple.isNVPTX() && !Triple.isSPIRV()) CmdArgs.push_back("-Wl,--no-undefined"); diff --git a/sycl/test-e2e/NewOffloadDriver/multisource.cpp b/sycl/test-e2e/NewOffloadDriver/multisource.cpp index 558fc1239dfc1..5371eb549600b 100644 --- a/sycl/test-e2e/NewOffloadDriver/multisource.cpp +++ b/sycl/test-e2e/NewOffloadDriver/multisource.cpp @@ -5,8 +5,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// XFAIL: cuda -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16413 // Separate kernel sources and host code sources // Test with `--offload-new-driver` // RUN: %{build} --offload-new-driver -c -o %t.kernel.o -DINIT_KERNEL -DCALC_KERNEL diff --git a/sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp b/sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp index 70ba4c2cec29e..fc08e94aee467 100644 --- a/sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp +++ b/sycl/test-e2e/NewOffloadDriver/split-per-source-main.cpp @@ -1,9 +1,6 @@ // RUN: %{build} -Wno-error=unused-command-line-argument -fsycl-device-code-split=per_source -I %S/Inputs -o %t.out %S/Inputs/split-per-source-second-file.cpp \ // RUN: --offload-new-driver -fsycl-dead-args-optimization // RUN: %{run} %t.out -// -// XFAIL: cuda -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16413 #include "Inputs/split-per-source.h" diff --git a/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp b/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp index 67be57e411805..2ab17a9b3d3d9 100644 --- 
a/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp +++ b/sycl/test-e2e/NewOffloadDriver/sycl-external-with-optional-features.cpp @@ -3,8 +3,6 @@ // RUN: %{build} -DSOURCE2 --offload-new-driver -c -o %t2.o // RUN: %clangxx -Wno-error=unused-command-line-argument -fsycl %{sycl_target_opts} --offload-new-driver %t1.o %t2.o -o %t.exe // RUN: %{run} %t.exe -// XFAIL: cuda -// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16413 // XFAIL: spirv-backend // XFAIL-TRACKER: CMPLRLLVM-64059 From 70f92a0be0c04c364d1babd4c0ba28b805fb2e38 Mon Sep 17 00:00:00 2001 From: JackAKirk Date: Wed, 29 Jan 2025 10:55:44 +0000 Subject: [PATCH 45/45] [compat] Remove usage of __SYCL_COMPILER_VERSION (#16812) There should be no logic based on the compiler version within the actual compiler: it can only have one version. Note: does not take into account time-travel :) Signed-off-by: JackAKirk --- sycl/include/syclcompat/device.hpp | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/sycl/include/syclcompat/device.hpp b/sycl/include/syclcompat/device.hpp index 616165121b9ed..d31fb4ef1650a 100644 --- a/sycl/include/syclcompat/device.hpp +++ b/sycl/include/syclcompat/device.hpp @@ -450,7 +450,6 @@ class device_ext : public sycl::device { /// \param [out] total_memory The number of bytes of total memory on the SYCL /// device. void get_memory_info(size_t &free_memory, size_t &total_memory) const { -#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) if (!has(sycl::aspect::ext_intel_free_memory)) { std::cerr << "[SYCLCompat] get_memory_info: ext_intel_free_memory is not " "supported." @@ -459,17 +458,6 @@ class device_ext : public sycl::device { } else { free_memory = get_info(); } -#else - std::cerr << "[SYCLCompat] get_memory_info: ext_intel_free_memory is not " - "supported." 
- << std::endl; - free_memory = 0; -#if defined(_MSC_VER) && !defined(__clang__) -#pragma message("Querying the number of bytes of free memory is not supported") -#else -#warning "Querying the number of bytes of free memory is not supported" -#endif -#endif total_memory = get_device_info().get_global_mem_size(); } @@ -489,15 +477,10 @@ class device_ext : public sycl::device { prop.set_minor_version(minor); prop.set_max_work_item_sizes( -#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902) - // oneAPI DPC++ compiler older than 2022/09/02, where - // max_work_item_sizes is an enum class element - get_info()); -#else // SYCL 2020-conformant code, max_work_item_sizes is a struct // templated by an int get_info>()); -#endif + prop.set_host_unified_memory(has(sycl::aspect::usm_host_allocations)); prop.set_max_clock_frequency(