From ad60255b0871043276a94d5d97c56770acf2dcdd Mon Sep 17 00:00:00 2001 From: Denis Yaroshevskiy Date: Thu, 26 Sep 2024 09:51:35 -0700 Subject: [PATCH] removing movemask from platform Summary: stepping stone for simd::contains. We want to do less things in simd platform if possbile, so moving out non essential things. Differential Revision: D63388617 --- CMakeLists.txt | 2 + folly/algorithm/simd/FindFixed.h | 2 +- folly/algorithm/simd/Movemask.h | 14 +++-- folly/algorithm/simd/detail/BUCK | 13 +++- folly/algorithm/simd/detail/Ignore.h | 61 +++++++++++++++++++ .../algorithm/simd/detail/SimdCharPlatform.h | 49 +++------------ folly/algorithm/simd/detail/SimdForEach.h | 18 +----- folly/algorithm/simd/detail/test/BUCK | 9 +++ .../algorithm/simd/detail/test/IgnoreTest.cpp | 45 ++++++++++++++ .../simd/detail/test/SimdAnyOfTest.cpp | 2 +- folly/algorithm/simd/test/MovemaskTest.cpp | 4 +- folly/detail/BUCK | 2 + folly/detail/SplitStringSimdImpl.h | 22 ++++--- 13 files changed, 164 insertions(+), 79 deletions(-) create mode 100644 folly/algorithm/simd/detail/Ignore.h create mode 100644 folly/algorithm/simd/detail/test/IgnoreTest.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c30fc9938ed..c14c45d4b59 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -649,6 +649,8 @@ if (BUILD_TESTS OR BUILD_BENCHMARKS) DIRECTORY algorithm/simd/detail/test/ TEST algorithm_simd_detail_simd_any_of_test SOURCES SimdAnyOfTest.cpp TEST algorithm_simd_detail_simd_for_each_test SOURCES SimdForEachTest.cpp + TEST algorithm_simd_detail_simd_for_each_test SOURCES SimdForEachTest.cpp + TEST algorithm_simd_detail_ignore_test SOURCES IgnoreTest.cpp TEST algorithm_simd_detail_unroll_utils_test SOURCES UnrollUtilsTest.cpp # disabled until C++20 # TEST algorithm_simd_detail_simd_traits_test SOURCES TraitsTest.cpp diff --git a/folly/algorithm/simd/FindFixed.h b/folly/algorithm/simd/FindFixed.h index 547b02a6b3a..cb441710999 100644 --- a/folly/algorithm/simd/FindFixed.h +++ b/folly/algorithm/simd/FindFixed.h @@ -194,7 +194,7 @@ std::optional findSplitFirstRegister( template std::optional firstTrue(Reg reg) { - auto [bits, bitsPerElement] = folly::movemask(reg); + auto [bits, bitsPerElement] = folly::simd::movemask(reg); if (bits) { return std::countr_zero(bits) / bitsPerElement(); } diff --git a/folly/algorithm/simd/Movemask.h b/folly/algorithm/simd/Movemask.h index 1487304e82c..cc549860643 100644 --- a/folly/algorithm/simd/Movemask.h +++ b/folly/algorithm/simd/Movemask.h @@ -33,7 +33,7 @@ FOLLY_PUSH_WARNING FOLLY_GCC_DISABLE_WARNING("-Wignored-attributes") -namespace folly { +namespace folly::simd { /* * This is a low level utility used for simd search algorithms. @@ -43,7 +43,7 @@ namespace folly { * for both x86 and arm. * * Interface looks like this: - * folly::movemask<-scalar type->(nativeRegister) + * folly::simd::movemask<-scalar type->(nativeRegister) * -> std::pair; * * Bits - unsigned integral, containing the bitmask (first is lowest bit). @@ -53,7 +53,7 @@ namespace folly { * * std::optional firstTrueUint16(auto simdRegister) { * auto [bits, bitsPerElement] = - * folly::movemask(simdRegister); + * folly::simd::movemask(simdRegister); * if (!bits) { * return std::nullopt; * } @@ -71,7 +71,11 @@ template auto movemask(Reg reg) { std::integral_constant bitsPerElement; - auto mmask = static_cast([&] { + + using uint_t = std:: + conditional_t, std::uint16_t, std::uint32_t>; + + auto mmask = static_cast([&] { if constexpr (std::is_same_v) { if constexpr (sizeof(Scalar) <= 2) { return _mm_movemask_epi8(reg); @@ -142,6 +146,6 @@ auto movemask(Reg reg) { #endif -} // namespace folly +} // namespace folly::simd FOLLY_POP_WARNING diff --git a/folly/algorithm/simd/detail/BUCK b/folly/algorithm/simd/detail/BUCK index 53a172a2e6b..e6cb987737c 100644 --- a/folly/algorithm/simd/detail/BUCK +++ b/folly/algorithm/simd/detail/BUCK @@ -5,6 +5,14 @@ load("@fbcode_macros//build_defs:cpp_library.bzl", "cpp_library") oncall("fbcode_entropy_wardens_folly") +cpp_library( + name = "ignore", + headers = ["Ignore.h"], + exported_deps = [ + "//folly/lang:bits", + ], +) + cpp_library( name = "simd_any_of", headers = ["SimdAnyOf.h"], @@ -19,7 +27,7 @@ cpp_library( name = "simd_char_platform", headers = ["SimdCharPlatform.h"], exported_deps = [ - ":simd_for_each", + ":ignore", "//folly:portability", "//folly/algorithm/simd:movemask", "//folly/lang:bits", @@ -30,9 +38,10 @@ cpp_library( name = "simd_for_each", headers = ["SimdForEach.h"], exported_deps = [ + ":ignore", + ":unroll_utils", "//folly:c_portability", "//folly:traits", - "//folly/algorithm/simd/detail:unroll_utils", ], ) diff --git a/folly/algorithm/simd/detail/Ignore.h b/folly/algorithm/simd/detail/Ignore.h new file mode 100644 index 00000000000..4031cb5c6c9 --- /dev/null +++ b/folly/algorithm/simd/detail/Ignore.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace folly::simd::detail { + +/** + * ignore(_none/_extrema) + * + * Tag types for handling the tails. + * ignore_none indicates that the whole register is used. + * ignore_extrema.first, .last show how many elements are out of the data. + * + * For example 3 elements, starting from the second for an 8 element register + * will be ignore_extrema{.first = 1, .last = 4} + */ + +struct ignore_extrema { + int first = 0; + int last = 0; +}; + +struct ignore_none {}; + +/* + * NOTE: for ignore none we don't clear anything, even if some bits are not + * doing anything. We expect mmask to only have zeroes in masked out elements. + * + * Maybe we need to revisit that at some point. + */ +template +void mmaskClearIgnored(std::pair& mmask, Ignore ignore) { + if constexpr (std::is_same_v) { + mmask.first = set_rzero(mmask.first, ignore.first * BitsPerElement{}); + + static constexpr int kTopBitsAlwaysIgnored = + sizeof(Uint) * 8 - Cardinal * BitsPerElement{}; + mmask.first = set_lzero( + mmask.first, ignore.last * BitsPerElement{} + kTopBitsAlwaysIgnored); + } +} + +} // namespace folly::simd::detail diff --git a/folly/algorithm/simd/detail/SimdCharPlatform.h b/folly/algorithm/simd/detail/SimdCharPlatform.h index fd8097d072e..6fb58049f5b 100644 --- a/folly/algorithm/simd/detail/SimdCharPlatform.h +++ b/folly/algorithm/simd/detail/SimdCharPlatform.h @@ -18,7 +18,8 @@ #include #include -#include +#include +#include #include #include @@ -68,13 +69,9 @@ namespace simd::detail { * - le_unsigned(reg_t, char) - by lane less than or equal to char. * * logical ops: - * - movemask - take a bitmask * - any(logical_t, ignore) - return true if any the lanes are true * - logical_or(logical_t, logical_t) - by lane logical or * - * mmask ops: - * - clear(mmask, ignore) - sets ignored bits to 0 - * */ #if FOLLY_X64 || FOLLY_AARCH64 @@ -82,33 +79,6 @@ namespace simd::detail { template struct SimdCharPlatformCommon : Platform { using logical_t = typename Platform::logical_t; - using movemask_result_t = - decltype(folly::movemask(logical_t{})); - using mmask_t = typename movemask_result_t::first_type; - static constexpr std::uint32_t kMmaskBitsPerElement = - typename movemask_result_t::second_type{}(); - - template - FOLLY_NODISCARD FOLLY_ALWAYS_INLINE static Uint setLowerNBits(int n) { - if (sizeof(Uint) == 8 && n == 64) { - return static_cast(-1); - } - return static_cast((std::uint64_t{1} << n) - 1); - } - - FOLLY_NODISCARD FOLLY_ALWAYS_INLINE static mmask_t clear( - mmask_t mmask, ignore_extrema ignore) { - mmask_t clearFirst = - ~setLowerNBits(ignore.first * kMmaskBitsPerElement); - mmask_t clearLast = setLowerNBits( - (Platform::kCardinal - ignore.last) * kMmaskBitsPerElement); - return mmask & clearFirst & clearLast; - } - - FOLLY_NODISCARD FOLLY_ALWAYS_INLINE static mmask_t clear( - mmask_t mmask, ignore_none) { - return mmask; - } // These are aligned loads but there is no point in generating // aligned load instructions, so we call loadu. @@ -122,18 +92,13 @@ struct SimdCharPlatformCommon : Platform { return Platform::unsafeLoadu(ptr, ignore_none{}); } - FOLLY_ALWAYS_INLINE - static mmask_t movemask(logical_t log) { - return folly::movemask(log).first; - } - using Platform::any; FOLLY_ALWAYS_INLINE static bool any(typename Platform::logical_t log, ignore_extrema ignore) { - auto mmask = movemask(log); - mmask = clear(mmask, ignore); - return mmask; + auto mmask = movemask(log); + mmaskClearIgnored(mmask, ignore); + return mmask.first; } static auto toArray(typename Platform::reg_t x) { @@ -186,7 +151,7 @@ struct SimdCharSse2PlatformSpecific { FOLLY_ALWAYS_INLINE static bool any(logical_t log, ignore_none) { - return folly::movemask(log).first; + return movemask(log).first; } }; @@ -234,7 +199,7 @@ struct SimdCharAvx2PlatformSpecific { FOLLY_ALWAYS_INLINE static bool any(logical_t log, ignore_none) { - return folly::movemask(log).first; + return simd::movemask(log).first; } }; diff --git a/folly/algorithm/simd/detail/SimdForEach.h b/folly/algorithm/simd/detail/SimdForEach.h index fe68daf6138..e10217e39b8 100644 --- a/folly/algorithm/simd/detail/SimdForEach.h +++ b/folly/algorithm/simd/detail/SimdForEach.h @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -35,23 +36,6 @@ namespace simd::detail { // to mess that up. // -/** - * ignore(_none/_extrema) - * - * Tag types for handling the tails. - * ignore_none indicates that the whole register is used. - * ignore_extrema.first, .last show how many elements are out of the data. - * - * For example 3 elements, starting from the second for an 8 element register - * will be ignore_extrema{.first = 1, .last = 4} - */ -struct ignore_extrema { - int first = 0; - int last = 0; -}; - -struct ignore_none {}; - /** * simdForEachAligning(cardinal, f, l, delegate); * diff --git a/folly/algorithm/simd/detail/test/BUCK b/folly/algorithm/simd/detail/test/BUCK index 6661c2ce012..50a3570b819 100644 --- a/folly/algorithm/simd/detail/test/BUCK +++ b/folly/algorithm/simd/detail/test/BUCK @@ -24,6 +24,15 @@ cpp_unittest( ], ) +cpp_unittest( + name = "ignore_test", + srcs = ["IgnoreTest.cpp"], + deps = [ + "//folly/algorithm/simd/detail:ignore", + "//folly/portability:gtest", + ], +) + cpp_unittest( name = "traits_test", srcs = ["TraitsTest.cpp"], diff --git a/folly/algorithm/simd/detail/test/IgnoreTest.cpp b/folly/algorithm/simd/detail/test/IgnoreTest.cpp new file mode 100644 index 00000000000..61e41e7370b --- /dev/null +++ b/folly/algorithm/simd/detail/test/IgnoreTest.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include + +namespace folly::simd::detail { + +struct IgnoreTest : ::testing::Test {}; + +TEST_F(IgnoreTest, MaskClearIgnored) { + auto mmask = + std::pair{std::uint8_t{0xff}, std::integral_constant{}}; + + // mostly relying on folly::clear_<>_n_bits working correctly + // simd any of also covers a lot of cases. + // this is just the bare minimal smoke test. + + mmaskClearIgnored<4>(mmask, ignore_none{}); + EXPECT_EQ(0xff, mmask.first); + + mmaskClearIgnored<4>(mmask, ignore_extrema{1, 2}); + EXPECT_EQ(0b0000'1100, mmask.first); + + mmaskClearIgnored<2>(mmask, ignore_extrema{0, 1}); + EXPECT_EQ(0b0000'0000, mmask.first); +} + +} // namespace folly::simd::detail diff --git a/folly/algorithm/simd/detail/test/SimdAnyOfTest.cpp b/folly/algorithm/simd/detail/test/SimdAnyOfTest.cpp index f08bc29aa6d..8ea006f0eae 100644 --- a/folly/algorithm/simd/detail/test/SimdAnyOfTest.cpp +++ b/folly/algorithm/simd/detail/test/SimdAnyOfTest.cpp @@ -83,7 +83,7 @@ TEST(SimdAnyOfSimple, Ignore) { buffer.fill(' '); for (auto& c : buffer) { c = 'a'; - anySpacesTest({&c, 1}, false); + ASSERT_NO_FATAL_FAILURE(anySpacesTest({&c, 1}, false)); c = ' '; } } diff --git a/folly/algorithm/simd/test/MovemaskTest.cpp b/folly/algorithm/simd/test/MovemaskTest.cpp index b97fb13630b..50616da5b23 100644 --- a/folly/algorithm/simd/test/MovemaskTest.cpp +++ b/folly/algorithm/simd/test/MovemaskTest.cpp @@ -55,11 +55,11 @@ void allOneTrueTests() { std::array arr; arr.fill(kFalse); - ASSERT_EQ(0, folly::movemask(loadReg(arr)).first); + ASSERT_EQ(0, folly::simd::movemask(loadReg(arr)).first); for (std::size_t i = 0; i != N; ++i) { arr[i] = kTrue; - auto [bits, bitsPerElement] = folly::movemask(loadReg(arr)); + auto [bits, bitsPerElement] = folly::simd::movemask(loadReg(arr)); std::uint64_t oneElement = safeShift(1, bitsPerElement()) - 1; std::uint64_t expectedBits = safeShift(oneElement, i * bitsPerElement()); diff --git a/folly/detail/BUCK b/folly/detail/BUCK index c23b20d0575..ff55fab3a9c 100644 --- a/folly/detail/BUCK +++ b/folly/detail/BUCK @@ -269,6 +269,8 @@ cpp_library( exported_deps = [ "//folly:portability", "//folly:range", + "//folly/algorithm/simd:movemask", + "//folly/algorithm/simd/detail:ignore", "//folly/algorithm/simd/detail:simd_char_platform", "//folly/algorithm/simd/detail:simd_for_each", "//folly/lang:bits", diff --git a/folly/detail/SplitStringSimdImpl.h b/folly/detail/SplitStringSimdImpl.h index aa3ed18e653..d37a6bd17a3 100644 --- a/folly/detail/SplitStringSimdImpl.h +++ b/folly/detail/SplitStringSimdImpl.h @@ -18,6 +18,8 @@ #include #include +#include +#include #include #include #include @@ -78,17 +80,19 @@ struct PlatformSimdSplitByChar { res.emplace_back(f, l - f); } - template + template FOLLY_ALWAYS_INLINE void outputStringsFoMmask( - Uint mmask, + std::pair mmask, const char* pos, const char*& prev, Container& res) const { // reserve was not beneficial on benchmarks. - while (mmask) { - auto counted = folly::findFirstSet(mmask) - 1; - mmask >>= counted; - mmask >>= Platform::kMmaskBitsPerElement; - auto firstSet = counted / Platform::kMmaskBitsPerElement; + + Uint mmaskBits = mmask.first; + while (mmaskBits) { + auto counted = folly::findFirstSet(mmaskBits) - 1; + mmaskBits >>= counted; + mmaskBits >>= BitsPerElement{}; + auto firstSet = counted / BitsPerElement{}; const char* split = pos + firstSet; pos = split + 1; @@ -108,8 +112,8 @@ struct PlatformSimdSplitByChar { FOLLY_ALWAYS_INLINE bool step( const char* ptr, Ignore ignore, UnrollIndex) const { reg_t loaded = Platform::loada(ptr, ignore); - auto mmask = Platform::movemask(Platform::equal(loaded, sep)); - mmask = Platform::clear(mmask, ignore); + auto mmask = simd::movemask(Platform::equal(loaded, sep)); + simd::detail::mmaskClearIgnored(mmask, ignore); self.outputStringsFoMmask(mmask, ptr, prev, res); return false; }