From 6534107d3e51929663151bd66c026cbd9fbb68ea Mon Sep 17 00:00:00 2001 From: Sainan Date: Wed, 14 Aug 2024 12:21:37 +0200 Subject: [PATCH 1/3] Don't update soup/base.hpp --- src/vendor/Soup/soup/_update.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/vendor/Soup/soup/_update.php b/src/vendor/Soup/soup/_update.php index 261e533389..3fed00b26c 100644 --- a/src/vendor/Soup/soup/_update.php +++ b/src/vendor/Soup/soup/_update.php @@ -17,7 +17,7 @@ $dir .= "/Soup/soup"; foreach (scandir(".") as $f) { - if (substr($f, -4) == ".cpp" || substr($f, -4) == ".hpp") + if ($f != "base.hpp" && (substr($f, -4) == ".cpp" || substr($f, -4) == ".hpp")) { copy("$dir/$f", $f); } From 71aa41b32c99805bc9a22e3d32b6eca7df8490ac Mon Sep 17 00:00:00 2001 From: Sainan Date: Mon, 12 Aug 2024 17:41:27 +0200 Subject: [PATCH 2/3] Update Soup --- src/lhttplib.cpp | 2 +- src/vendor/Soup/Intrin/aes_helper.cpp | 819 ++++++++++---------- src/vendor/Soup/Intrin/crc32_intrin.cpp | 79 +- src/vendor/Soup/Intrin/hardware_rng.cpp | 76 +- src/vendor/Soup/Intrin/sha1_transform.cpp | 707 ++++++++--------- src/vendor/Soup/Intrin/sha256_transform.cpp | 743 +++++++++--------- src/vendor/Soup/soup/Buffer.hpp | 10 +- src/vendor/Soup/soup/CryptoHashAlgo.hpp | 89 ++- src/vendor/Soup/soup/HardwareRng.cpp | 24 +- src/vendor/Soup/soup/HttpRequest.cpp | 2 +- src/vendor/Soup/soup/HttpRequestTask.cpp | 22 +- src/vendor/Soup/soup/HttpRequestTask.hpp | 3 +- src/vendor/Soup/soup/Reader.cpp | 44 +- src/vendor/Soup/soup/Reader.hpp | 3 +- src/vendor/Soup/soup/Socket.cpp | 96 ++- src/vendor/Soup/soup/Socket.hpp | 10 +- src/vendor/Soup/soup/SocketTlsEncrypter.cpp | 56 +- src/vendor/Soup/soup/SocketTlsEncrypter.hpp | 9 +- src/vendor/Soup/soup/Uri.cpp | 79 +- src/vendor/Soup/soup/Writer.cpp | 51 +- src/vendor/Soup/soup/Writer.hpp | 3 +- src/vendor/Soup/soup/aes.cpp | 90 +-- src/vendor/Soup/soup/crc32.cpp | 18 +- src/vendor/Soup/soup/deflate.cpp | 9 +- src/vendor/Soup/soup/deflate.hpp | 1 + src/vendor/Soup/soup/filesystem.cpp | 9 + src/vendor/Soup/soup/filesystem.hpp | 2 + src/vendor/Soup/soup/fwd.hpp | 4 +- src/vendor/Soup/soup/netStatus.cpp | 22 +- src/vendor/Soup/soup/netStatus.hpp | 1 + src/vendor/Soup/soup/os.cpp | 6 +- src/vendor/Soup/soup/os.hpp | 7 + src/vendor/Soup/soup/sha1.cpp | 197 ++--- src/vendor/Soup/soup/sha1.hpp | 37 + src/vendor/Soup/soup/sha256.cpp | 160 ++-- src/vendor/Soup/soup/sha256.hpp | 36 + src/vendor/Soup/soup/sha384.cpp | 82 +- src/vendor/Soup/soup/sha384.hpp | 18 +- src/vendor/Soup/soup/sha512.cpp | 124 +-- src/vendor/Soup/soup/sha512.hpp | 46 +- src/vendor/Soup/soup/string.cpp | 2 + 41 files changed, 1936 insertions(+), 1862 deletions(-) diff --git a/src/lhttplib.cpp b/src/lhttplib.cpp index 5aafc504e9..4c1f6ba305 100644 --- a/src/lhttplib.cpp +++ b/src/lhttplib.cpp @@ -30,7 +30,7 @@ static int push_http_response (lua_State *L, soup::HttpRequestTask& task) { #if SOUP_WASM return 1; /* specialized HttpRequestTask for WASM doesn't have `getStatus` */ #else - lua_pushstring(L, soup::netStatusToString(task.getStatus())); + pluto_pushstring(L, task.getStatus()); return 2; #endif } diff --git a/src/vendor/Soup/Intrin/aes_helper.cpp b/src/vendor/Soup/Intrin/aes_helper.cpp index 01a5d5cacb..83973632e8 100644 --- a/src/vendor/Soup/Intrin/aes_helper.cpp +++ b/src/vendor/Soup/Intrin/aes_helper.cpp @@ -1,8 +1,10 @@ +#include "../soup/base.hpp" + #include -#if defined(__x86_64__) || defined(_M_X64) +#if SOUP_X86 #include -#elif defined(__aarch64__) || defined(_M_ARM64) +#elif SOUP_ARM #include #endif @@ -12,434 +14,437 @@ // ARM: // - https://blog.michaelbrase.com/2018/06/04/optimizing-x86-aes-intrinsics-on-armv8-a/ -namespace soup_intrin +NAMESPACE_SOUP { -#if defined(__x86_64__) || defined(_M_X64) - [[nodiscard]] static __m128i aes_expand_key_step(__m128i key0, __m128i key1) noexcept + namespace intrin { - key0 = _mm_xor_si128(key0, _mm_slli_si128(key0, 4)); - key0 = _mm_xor_si128(key0, _mm_slli_si128(key0, 4)); - key0 = _mm_xor_si128(key0, _mm_slli_si128(key0, 4)); - return _mm_xor_si128(key0, _mm_shuffle_epi32(key1, 0xff)); - } +#if SOUP_X86 + [[nodiscard]] static __m128i aes_expand_key_step(__m128i key0, __m128i key1) noexcept + { + key0 = _mm_xor_si128(key0, _mm_slli_si128(key0, 4)); + key0 = _mm_xor_si128(key0, _mm_slli_si128(key0, 4)); + key0 = _mm_xor_si128(key0, _mm_slli_si128(key0, 4)); + return _mm_xor_si128(key0, _mm_shuffle_epi32(key1, 0xff)); + } - [[nodiscard]] static __m128i aes_expand_key_odd_step(__m128i key0, __m128i key1) noexcept - { - key0 = _mm_aeskeygenassist_si128(key0, 0); - key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); - key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); - key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); - return _mm_xor_si128(key1, _mm_shuffle_epi32(key0, 0xaa)); - } + [[nodiscard]] static __m128i aes_expand_key_odd_step(__m128i key0, __m128i key1) noexcept + { + key0 = _mm_aeskeygenassist_si128(key0, 0); + key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); + key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); + key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); + return _mm_xor_si128(key1, _mm_shuffle_epi32(key0, 0xaa)); + } - void aes_expand_key_128(uint8_t w[176], const uint8_t key[16]) noexcept - { - reinterpret_cast<__m128i*>(w)[0] = _mm_loadu_si128(reinterpret_cast(key)); - reinterpret_cast<__m128i*>(w)[1] = aes_expand_key_step(reinterpret_cast(w)[0], _mm_aeskeygenassist_si128(reinterpret_cast(w)[0], 0x01)); - reinterpret_cast<__m128i*>(w)[2] = aes_expand_key_step(reinterpret_cast(w)[1], _mm_aeskeygenassist_si128(reinterpret_cast(w)[1], 0x02)); - reinterpret_cast<__m128i*>(w)[3] = aes_expand_key_step(reinterpret_cast(w)[2], _mm_aeskeygenassist_si128(reinterpret_cast(w)[2], 0x04)); - reinterpret_cast<__m128i*>(w)[4] = aes_expand_key_step(reinterpret_cast(w)[3], _mm_aeskeygenassist_si128(reinterpret_cast(w)[3], 0x08)); - reinterpret_cast<__m128i*>(w)[5] = aes_expand_key_step(reinterpret_cast(w)[4], _mm_aeskeygenassist_si128(reinterpret_cast(w)[4], 0x10)); - reinterpret_cast<__m128i*>(w)[6] = aes_expand_key_step(reinterpret_cast(w)[5], _mm_aeskeygenassist_si128(reinterpret_cast(w)[5], 0x20)); - reinterpret_cast<__m128i*>(w)[7] = aes_expand_key_step(reinterpret_cast(w)[6], _mm_aeskeygenassist_si128(reinterpret_cast(w)[6], 0x40)); - reinterpret_cast<__m128i*>(w)[8] = aes_expand_key_step(reinterpret_cast(w)[7], _mm_aeskeygenassist_si128(reinterpret_cast(w)[7], 0x80)); - reinterpret_cast<__m128i*>(w)[9] = aes_expand_key_step(reinterpret_cast(w)[8], _mm_aeskeygenassist_si128(reinterpret_cast(w)[8], 0x1B)); - reinterpret_cast<__m128i*>(w)[10] = aes_expand_key_step(reinterpret_cast(w)[9], _mm_aeskeygenassist_si128(reinterpret_cast(w)[9], 0x36)); - } + void aes_expand_key_128(uint8_t w[176], const uint8_t key[16]) noexcept + { + reinterpret_cast<__m128i*>(w)[0] = _mm_loadu_si128(reinterpret_cast(key)); + reinterpret_cast<__m128i*>(w)[1] = aes_expand_key_step(reinterpret_cast(w)[0], _mm_aeskeygenassist_si128(reinterpret_cast(w)[0], 0x01)); + reinterpret_cast<__m128i*>(w)[2] = aes_expand_key_step(reinterpret_cast(w)[1], _mm_aeskeygenassist_si128(reinterpret_cast(w)[1], 0x02)); + reinterpret_cast<__m128i*>(w)[3] = aes_expand_key_step(reinterpret_cast(w)[2], _mm_aeskeygenassist_si128(reinterpret_cast(w)[2], 0x04)); + reinterpret_cast<__m128i*>(w)[4] = aes_expand_key_step(reinterpret_cast(w)[3], _mm_aeskeygenassist_si128(reinterpret_cast(w)[3], 0x08)); + reinterpret_cast<__m128i*>(w)[5] = aes_expand_key_step(reinterpret_cast(w)[4], _mm_aeskeygenassist_si128(reinterpret_cast(w)[4], 0x10)); + reinterpret_cast<__m128i*>(w)[6] = aes_expand_key_step(reinterpret_cast(w)[5], _mm_aeskeygenassist_si128(reinterpret_cast(w)[5], 0x20)); + reinterpret_cast<__m128i*>(w)[7] = aes_expand_key_step(reinterpret_cast(w)[6], _mm_aeskeygenassist_si128(reinterpret_cast(w)[6], 0x40)); + reinterpret_cast<__m128i*>(w)[8] = aes_expand_key_step(reinterpret_cast(w)[7], _mm_aeskeygenassist_si128(reinterpret_cast(w)[7], 0x80)); + reinterpret_cast<__m128i*>(w)[9] = aes_expand_key_step(reinterpret_cast(w)[8], _mm_aeskeygenassist_si128(reinterpret_cast(w)[8], 0x1B)); + reinterpret_cast<__m128i*>(w)[10] = aes_expand_key_step(reinterpret_cast(w)[9], _mm_aeskeygenassist_si128(reinterpret_cast(w)[9], 0x36)); + } - static void KEY_192_ASSIST(__m128i* temp1, __m128i* temp2, __m128i* temp3) noexcept - { - __m128i temp4; - *temp2 = _mm_shuffle_epi32(*temp2, 0x55); - temp4 = _mm_slli_si128(*temp1, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - temp4 = _mm_slli_si128(temp4, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - temp4 = _mm_slli_si128(temp4, 0x4); - *temp1 = _mm_xor_si128(*temp1, temp4); - *temp1 = _mm_xor_si128(*temp1, *temp2); - *temp2 = _mm_shuffle_epi32(*temp1, 0xff); - temp4 = _mm_slli_si128(*temp3, 0x4); - *temp3 = _mm_xor_si128(*temp3, temp4); - *temp3 = _mm_xor_si128(*temp3, *temp2); - } + static void KEY_192_ASSIST(__m128i* temp1, __m128i* temp2, __m128i* temp3) noexcept + { + __m128i temp4; + *temp2 = _mm_shuffle_epi32(*temp2, 0x55); + temp4 = _mm_slli_si128(*temp1, 0x4); + *temp1 = _mm_xor_si128(*temp1, temp4); + temp4 = _mm_slli_si128(temp4, 0x4); + *temp1 = _mm_xor_si128(*temp1, temp4); + temp4 = _mm_slli_si128(temp4, 0x4); + *temp1 = _mm_xor_si128(*temp1, temp4); + *temp1 = _mm_xor_si128(*temp1, *temp2); + *temp2 = _mm_shuffle_epi32(*temp1, 0xff); + temp4 = _mm_slli_si128(*temp3, 0x4); + *temp3 = _mm_xor_si128(*temp3, temp4); + *temp3 = _mm_xor_si128(*temp3, *temp2); + } - void aes_expand_key_192(uint8_t w[208], const uint8_t key[24]) noexcept - { - __m128i temp1, temp2, temp3; - __m128i* Key_Schedule = (__m128i*)w; - temp1 = _mm_loadu_si128((__m128i*)key); - temp3 = _mm_set_epi64x(0, *reinterpret_cast(key + 16)); - Key_Schedule[0] = temp1; - Key_Schedule[1] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x1); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(Key_Schedule[1]), _mm_castsi128_pd(temp1), 0)); - Key_Schedule[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp1), _mm_castsi128_pd(temp3), 1)); - temp2 = _mm_aeskeygenassist_si128(temp3, 0x2); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[3] = temp1; - Key_Schedule[4] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x4); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(Key_Schedule[4]), _mm_castsi128_pd(temp1), 0)); - Key_Schedule[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp1), _mm_castsi128_pd(temp3), 1)); - temp2 = _mm_aeskeygenassist_si128(temp3, 0x8); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[6] = temp1; - Key_Schedule[7] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x10); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(Key_Schedule[7]), _mm_castsi128_pd(temp1), 0)); - Key_Schedule[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp1), _mm_castsi128_pd(temp3), 1)); - temp2 = _mm_aeskeygenassist_si128(temp3, 0x20); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[9] = temp1; - Key_Schedule[10] = temp3; - temp2 = _mm_aeskeygenassist_si128(temp3, 0x40); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(Key_Schedule[10]), _mm_castsi128_pd(temp1), 0)); - Key_Schedule[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp1), _mm_castsi128_pd(temp3), 1)); - temp2 = _mm_aeskeygenassist_si128(temp3, 0x80); - KEY_192_ASSIST(&temp1, &temp2, &temp3); - Key_Schedule[12] = temp1; - } + void aes_expand_key_192(uint8_t w[208], const uint8_t key[24]) noexcept + { + __m128i temp1, temp2, temp3; + __m128i* Key_Schedule = (__m128i*)w; + temp1 = _mm_loadu_si128((__m128i*)key); + temp3 = _mm_set_epi64x(0, *reinterpret_cast(key + 16)); + Key_Schedule[0] = temp1; + Key_Schedule[1] = temp3; + temp2 = _mm_aeskeygenassist_si128(temp3, 0x1); + KEY_192_ASSIST(&temp1, &temp2, &temp3); + Key_Schedule[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(Key_Schedule[1]), _mm_castsi128_pd(temp1), 0)); + Key_Schedule[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp1), _mm_castsi128_pd(temp3), 1)); + temp2 = _mm_aeskeygenassist_si128(temp3, 0x2); + KEY_192_ASSIST(&temp1, &temp2, &temp3); + Key_Schedule[3] = temp1; + Key_Schedule[4] = temp3; + temp2 = _mm_aeskeygenassist_si128(temp3, 0x4); + KEY_192_ASSIST(&temp1, &temp2, &temp3); + Key_Schedule[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(Key_Schedule[4]), _mm_castsi128_pd(temp1), 0)); + Key_Schedule[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp1), _mm_castsi128_pd(temp3), 1)); + temp2 = _mm_aeskeygenassist_si128(temp3, 0x8); + KEY_192_ASSIST(&temp1, &temp2, &temp3); + Key_Schedule[6] = temp1; + Key_Schedule[7] = temp3; + temp2 = _mm_aeskeygenassist_si128(temp3, 0x10); + KEY_192_ASSIST(&temp1, &temp2, &temp3); + Key_Schedule[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(Key_Schedule[7]), _mm_castsi128_pd(temp1), 0)); + Key_Schedule[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp1), _mm_castsi128_pd(temp3), 1)); + temp2 = _mm_aeskeygenassist_si128(temp3, 0x20); + KEY_192_ASSIST(&temp1, &temp2, &temp3); + Key_Schedule[9] = temp1; + Key_Schedule[10] = temp3; + temp2 = _mm_aeskeygenassist_si128(temp3, 0x40); + KEY_192_ASSIST(&temp1, &temp2, &temp3); + Key_Schedule[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(Key_Schedule[10]), _mm_castsi128_pd(temp1), 0)); + Key_Schedule[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp1), _mm_castsi128_pd(temp3), 1)); + temp2 = _mm_aeskeygenassist_si128(temp3, 0x80); + KEY_192_ASSIST(&temp1, &temp2, &temp3); + Key_Schedule[12] = temp1; + } - void aes_expand_key_256(uint8_t w[240], const uint8_t key[32]) noexcept - { - reinterpret_cast<__m128i*>(w)[0] = _mm_loadu_si128(&reinterpret_cast(key)[0]); - reinterpret_cast<__m128i*>(w)[1] = _mm_loadu_si128(&reinterpret_cast(key)[1]); - reinterpret_cast<__m128i*>(w)[2] = aes_expand_key_step(reinterpret_cast(w)[0], _mm_aeskeygenassist_si128(reinterpret_cast(w)[1], 0x01)); - reinterpret_cast<__m128i*>(w)[3] = aes_expand_key_odd_step(reinterpret_cast(w)[2], reinterpret_cast<__m128i*>(w)[1]); - reinterpret_cast<__m128i*>(w)[4] = aes_expand_key_step(reinterpret_cast(w)[2], _mm_aeskeygenassist_si128(reinterpret_cast(w)[3], 0x02)); - reinterpret_cast<__m128i*>(w)[5] = aes_expand_key_odd_step(reinterpret_cast(w)[4], reinterpret_cast<__m128i*>(w)[3]); - reinterpret_cast<__m128i*>(w)[6] = aes_expand_key_step(reinterpret_cast(w)[4], _mm_aeskeygenassist_si128(reinterpret_cast(w)[5], 0x04)); - reinterpret_cast<__m128i*>(w)[7] = aes_expand_key_odd_step(reinterpret_cast(w)[6], reinterpret_cast<__m128i*>(w)[5]); - reinterpret_cast<__m128i*>(w)[8] = aes_expand_key_step(reinterpret_cast(w)[6], _mm_aeskeygenassist_si128(reinterpret_cast(w)[7], 0x08)); - reinterpret_cast<__m128i*>(w)[9] = aes_expand_key_odd_step(reinterpret_cast(w)[8], reinterpret_cast<__m128i*>(w)[7]); - reinterpret_cast<__m128i*>(w)[10] = aes_expand_key_step(reinterpret_cast(w)[8], _mm_aeskeygenassist_si128(reinterpret_cast(w)[9], 0x10)); - reinterpret_cast<__m128i*>(w)[11] = aes_expand_key_odd_step(reinterpret_cast(w)[10], reinterpret_cast<__m128i*>(w)[9]); - reinterpret_cast<__m128i*>(w)[12] = aes_expand_key_step(reinterpret_cast(w)[10], _mm_aeskeygenassist_si128(reinterpret_cast(w)[11], 0x20)); - reinterpret_cast<__m128i*>(w)[13] = aes_expand_key_odd_step(reinterpret_cast(w)[12], reinterpret_cast<__m128i*>(w)[11]); - reinterpret_cast<__m128i*>(w)[14] = aes_expand_key_step(reinterpret_cast(w)[12], _mm_aeskeygenassist_si128(reinterpret_cast(w)[13], 0x40)); - } + void aes_expand_key_256(uint8_t w[240], const uint8_t key[32]) noexcept + { + reinterpret_cast<__m128i*>(w)[0] = _mm_loadu_si128(&reinterpret_cast(key)[0]); + reinterpret_cast<__m128i*>(w)[1] = _mm_loadu_si128(&reinterpret_cast(key)[1]); + reinterpret_cast<__m128i*>(w)[2] = aes_expand_key_step(reinterpret_cast(w)[0], _mm_aeskeygenassist_si128(reinterpret_cast(w)[1], 0x01)); + reinterpret_cast<__m128i*>(w)[3] = aes_expand_key_odd_step(reinterpret_cast(w)[2], reinterpret_cast<__m128i*>(w)[1]); + reinterpret_cast<__m128i*>(w)[4] = aes_expand_key_step(reinterpret_cast(w)[2], _mm_aeskeygenassist_si128(reinterpret_cast(w)[3], 0x02)); + reinterpret_cast<__m128i*>(w)[5] = aes_expand_key_odd_step(reinterpret_cast(w)[4], reinterpret_cast<__m128i*>(w)[3]); + reinterpret_cast<__m128i*>(w)[6] = aes_expand_key_step(reinterpret_cast(w)[4], _mm_aeskeygenassist_si128(reinterpret_cast(w)[5], 0x04)); + reinterpret_cast<__m128i*>(w)[7] = aes_expand_key_odd_step(reinterpret_cast(w)[6], reinterpret_cast<__m128i*>(w)[5]); + reinterpret_cast<__m128i*>(w)[8] = aes_expand_key_step(reinterpret_cast(w)[6], _mm_aeskeygenassist_si128(reinterpret_cast(w)[7], 0x08)); + reinterpret_cast<__m128i*>(w)[9] = aes_expand_key_odd_step(reinterpret_cast(w)[8], reinterpret_cast<__m128i*>(w)[7]); + reinterpret_cast<__m128i*>(w)[10] = aes_expand_key_step(reinterpret_cast(w)[8], _mm_aeskeygenassist_si128(reinterpret_cast(w)[9], 0x10)); + reinterpret_cast<__m128i*>(w)[11] = aes_expand_key_odd_step(reinterpret_cast(w)[10], reinterpret_cast<__m128i*>(w)[9]); + reinterpret_cast<__m128i*>(w)[12] = aes_expand_key_step(reinterpret_cast(w)[10], _mm_aeskeygenassist_si128(reinterpret_cast(w)[11], 0x20)); + reinterpret_cast<__m128i*>(w)[13] = aes_expand_key_odd_step(reinterpret_cast(w)[12], reinterpret_cast<__m128i*>(w)[11]); + reinterpret_cast<__m128i*>(w)[14] = aes_expand_key_step(reinterpret_cast(w)[12], _mm_aeskeygenassist_si128(reinterpret_cast(w)[13], 0x40)); + } - void aes_encrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept - { - __m128i data = *reinterpret_cast(in); - data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[0]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[1]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[2]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[3]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[4]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[5]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[6]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[7]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[8]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[9]); - data = _mm_aesenclast_si128(data, reinterpret_cast(roundKeys)[10]); - *reinterpret_cast<__m128i*>(out) = data; - } + void aes_encrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept + { + __m128i data = *reinterpret_cast(in); + data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[0]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[1]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[2]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[3]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[4]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[5]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[6]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[7]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[8]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[9]); + data = _mm_aesenclast_si128(data, reinterpret_cast(roundKeys)[10]); + *reinterpret_cast<__m128i*>(out) = data; + } - void aes_encrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept - { - __m128i data = *reinterpret_cast(in); - data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[0]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[1]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[2]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[3]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[4]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[5]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[6]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[7]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[8]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[9]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[10]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[11]); - data = _mm_aesenclast_si128(data, reinterpret_cast(roundKeys)[12]); - *reinterpret_cast<__m128i*>(out) = data; - } + void aes_encrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept + { + __m128i data = *reinterpret_cast(in); + data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[0]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[1]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[2]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[3]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[4]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[5]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[6]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[7]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[8]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[9]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[10]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[11]); + data = _mm_aesenclast_si128(data, reinterpret_cast(roundKeys)[12]); + *reinterpret_cast<__m128i*>(out) = data; + } - void aes_encrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept - { - __m128i data = *reinterpret_cast(in); - data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[0]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[1]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[2]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[3]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[4]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[5]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[6]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[7]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[8]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[9]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[10]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[11]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[12]); - data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[13]); - data = _mm_aesenclast_si128(data, reinterpret_cast(roundKeys)[14]); - *reinterpret_cast<__m128i*>(out) = data; - } + void aes_encrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept + { + __m128i data = *reinterpret_cast(in); + data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[0]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[1]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[2]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[3]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[4]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[5]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[6]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[7]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[8]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[9]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[10]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[11]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[12]); + data = _mm_aesenc_si128(data, reinterpret_cast(roundKeys)[13]); + data = _mm_aesenclast_si128(data, reinterpret_cast(roundKeys)[14]); + *reinterpret_cast<__m128i*>(out) = data; + } - void aes_prepare_decryption_128(uint8_t w[176]) noexcept - { - reinterpret_cast<__m128i*>(w)[1] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[1]); - reinterpret_cast<__m128i*>(w)[2] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[2]); - reinterpret_cast<__m128i*>(w)[3] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[3]); - reinterpret_cast<__m128i*>(w)[4] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[4]); - reinterpret_cast<__m128i*>(w)[5] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[5]); - reinterpret_cast<__m128i*>(w)[6] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[6]); - reinterpret_cast<__m128i*>(w)[7] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[7]); - reinterpret_cast<__m128i*>(w)[8] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[8]); - reinterpret_cast<__m128i*>(w)[9] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[9]); - } + void aes_prepare_decryption_128(uint8_t w[176]) noexcept + { + reinterpret_cast<__m128i*>(w)[1] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[1]); + reinterpret_cast<__m128i*>(w)[2] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[2]); + reinterpret_cast<__m128i*>(w)[3] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[3]); + reinterpret_cast<__m128i*>(w)[4] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[4]); + reinterpret_cast<__m128i*>(w)[5] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[5]); + reinterpret_cast<__m128i*>(w)[6] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[6]); + reinterpret_cast<__m128i*>(w)[7] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[7]); + reinterpret_cast<__m128i*>(w)[8] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[8]); + reinterpret_cast<__m128i*>(w)[9] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[9]); + } - void aes_prepare_decryption_192(uint8_t w[208]) noexcept - { - reinterpret_cast<__m128i*>(w)[1] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[1]); - reinterpret_cast<__m128i*>(w)[2] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[2]); - reinterpret_cast<__m128i*>(w)[3] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[3]); - reinterpret_cast<__m128i*>(w)[4] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[4]); - reinterpret_cast<__m128i*>(w)[5] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[5]); - reinterpret_cast<__m128i*>(w)[6] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[6]); - reinterpret_cast<__m128i*>(w)[7] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[7]); - reinterpret_cast<__m128i*>(w)[8] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[8]); - reinterpret_cast<__m128i*>(w)[9] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[9]); - reinterpret_cast<__m128i*>(w)[10] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[10]); - reinterpret_cast<__m128i*>(w)[11] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[11]); - } + void aes_prepare_decryption_192(uint8_t w[208]) noexcept + { + reinterpret_cast<__m128i*>(w)[1] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[1]); + reinterpret_cast<__m128i*>(w)[2] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[2]); + reinterpret_cast<__m128i*>(w)[3] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[3]); + reinterpret_cast<__m128i*>(w)[4] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[4]); + reinterpret_cast<__m128i*>(w)[5] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[5]); + reinterpret_cast<__m128i*>(w)[6] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[6]); + reinterpret_cast<__m128i*>(w)[7] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[7]); + reinterpret_cast<__m128i*>(w)[8] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[8]); + reinterpret_cast<__m128i*>(w)[9] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[9]); + reinterpret_cast<__m128i*>(w)[10] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[10]); + reinterpret_cast<__m128i*>(w)[11] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[11]); + } - void aes_prepare_decryption_256(uint8_t w[240]) noexcept - { - reinterpret_cast<__m128i*>(w)[1] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[1]); - reinterpret_cast<__m128i*>(w)[2] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[2]); - reinterpret_cast<__m128i*>(w)[3] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[3]); - reinterpret_cast<__m128i*>(w)[4] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[4]); - reinterpret_cast<__m128i*>(w)[5] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[5]); - reinterpret_cast<__m128i*>(w)[6] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[6]); - reinterpret_cast<__m128i*>(w)[7] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[7]); - reinterpret_cast<__m128i*>(w)[8] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[8]); - reinterpret_cast<__m128i*>(w)[9] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[9]); - reinterpret_cast<__m128i*>(w)[10] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[10]); - reinterpret_cast<__m128i*>(w)[11] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[11]); - reinterpret_cast<__m128i*>(w)[12] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[12]); - reinterpret_cast<__m128i*>(w)[13] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[13]); - } + void aes_prepare_decryption_256(uint8_t w[240]) noexcept + { + reinterpret_cast<__m128i*>(w)[1] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[1]); + reinterpret_cast<__m128i*>(w)[2] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[2]); + reinterpret_cast<__m128i*>(w)[3] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[3]); + reinterpret_cast<__m128i*>(w)[4] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[4]); + reinterpret_cast<__m128i*>(w)[5] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[5]); + reinterpret_cast<__m128i*>(w)[6] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[6]); + reinterpret_cast<__m128i*>(w)[7] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[7]); + reinterpret_cast<__m128i*>(w)[8] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[8]); + reinterpret_cast<__m128i*>(w)[9] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[9]); + reinterpret_cast<__m128i*>(w)[10] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[10]); + reinterpret_cast<__m128i*>(w)[11] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[11]); + reinterpret_cast<__m128i*>(w)[12] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[12]); + reinterpret_cast<__m128i*>(w)[13] = _mm_aesimc_si128(reinterpret_cast<__m128i*>(w)[13]); + } - void aes_decrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept - { - __m128i data = *reinterpret_cast(in); - data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[10]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[9]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[8]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[7]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[6]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[5]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[4]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[3]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[2]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[1]); - data = _mm_aesdeclast_si128(data, reinterpret_cast(roundKeys)[0]); - *reinterpret_cast<__m128i*>(out) = data; - } + void aes_decrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept + { + __m128i data = *reinterpret_cast(in); + data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[10]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[9]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[8]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[7]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[6]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[5]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[4]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[3]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[2]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[1]); + data = _mm_aesdeclast_si128(data, reinterpret_cast(roundKeys)[0]); + *reinterpret_cast<__m128i*>(out) = data; + } - void aes_decrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept - { - __m128i data = *reinterpret_cast(in); - data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[12]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[11]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[10]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[9]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[8]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[7]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[6]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[5]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[4]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[3]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[2]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[1]); - data = _mm_aesdeclast_si128(data, reinterpret_cast(roundKeys)[0]); - *reinterpret_cast<__m128i*>(out) = data; - } + void aes_decrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept + { + __m128i data = *reinterpret_cast(in); + data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[12]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[11]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[10]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[9]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[8]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[7]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[6]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[5]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[4]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[3]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[2]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[1]); + data = _mm_aesdeclast_si128(data, reinterpret_cast(roundKeys)[0]); + *reinterpret_cast<__m128i*>(out) = data; + } - void aes_decrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept - { - __m128i data = *reinterpret_cast(in); - data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[14]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[13]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[12]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[11]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[10]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[9]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[8]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[7]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[6]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[5]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[4]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[3]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[2]); - data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[1]); - data = _mm_aesdeclast_si128(data, reinterpret_cast(roundKeys)[0]); - *reinterpret_cast<__m128i*>(out) = data; - } -#elif defined(__aarch64__) || defined(_M_ARM64) - void aes_encrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept - { - auto data = vld1q_u8(in); - data = vaeseq_u8(data, vld1q_u8(&roundKeys[0 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[1 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[2 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[3 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[4 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[5 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[6 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[7 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[8 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[9 * 16])); - data = veorq_u8(data, vld1q_u8(&roundKeys[10 * 16])); - vst1q_u8(out, data); - } + void aes_decrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept + { + __m128i data = *reinterpret_cast(in); + data = _mm_xor_si128(data, reinterpret_cast(roundKeys)[14]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[13]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[12]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[11]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[10]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[9]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[8]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[7]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[6]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[5]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[4]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[3]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[2]); + data = _mm_aesdec_si128(data, reinterpret_cast(roundKeys)[1]); + data = _mm_aesdeclast_si128(data, reinterpret_cast(roundKeys)[0]); + *reinterpret_cast<__m128i*>(out) = data; + } +#elif SOUP_ARM + void aes_encrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept + { + auto data = vld1q_u8(in); + data = vaeseq_u8(data, vld1q_u8(&roundKeys[0 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[1 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[2 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[3 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[4 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[5 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[6 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[7 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[8 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[9 * 16])); + data = veorq_u8(data, vld1q_u8(&roundKeys[10 * 16])); + vst1q_u8(out, data); + } - void aes_encrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept - { - auto data = vld1q_u8(in); - data = vaeseq_u8(data, vld1q_u8(&roundKeys[0 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[1 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[2 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[3 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[4 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[5 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[6 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[7 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[8 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[9 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[10 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[11 * 16])); - data = veorq_u8(data, vld1q_u8(&roundKeys[12 * 16])); - vst1q_u8(out, data); - } + void aes_encrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept + { + auto data = vld1q_u8(in); + data = vaeseq_u8(data, vld1q_u8(&roundKeys[0 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[1 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[2 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[3 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[4 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[5 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[6 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[7 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[8 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[9 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[10 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[11 * 16])); + data = veorq_u8(data, vld1q_u8(&roundKeys[12 * 16])); + vst1q_u8(out, data); + } - void aes_encrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept - { - auto data = vld1q_u8(in); - data = vaeseq_u8(data, vld1q_u8(&roundKeys[0 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[1 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[2 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[3 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[4 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[5 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[6 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[7 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[8 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[9 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[10 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[11 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[12 * 16])); - data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[13 * 16])); - data = veorq_u8(data, vld1q_u8(&roundKeys[14 * 16])); - vst1q_u8(out, data); - } + void aes_encrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept + { + auto data = vld1q_u8(in); + data = vaeseq_u8(data, vld1q_u8(&roundKeys[0 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[1 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[2 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[3 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[4 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[5 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[6 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[7 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[8 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[9 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[10 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[11 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[12 * 16])); + data = vaeseq_u8(vaesmcq_u8(data), vld1q_u8(&roundKeys[13 * 16])); + data = veorq_u8(data, vld1q_u8(&roundKeys[14 * 16])); + vst1q_u8(out, data); + } - void aes_prepare_decryption_128(uint8_t w[176]) noexcept - { - vst1q_u8(&w[1 * 16], vaesimcq_u8(vld1q_u8(&w[1 * 16]))); - vst1q_u8(&w[2 * 16], vaesimcq_u8(vld1q_u8(&w[2 * 16]))); - vst1q_u8(&w[3 * 16], vaesimcq_u8(vld1q_u8(&w[3 * 16]))); - vst1q_u8(&w[4 * 16], vaesimcq_u8(vld1q_u8(&w[4 * 16]))); - vst1q_u8(&w[5 * 16], vaesimcq_u8(vld1q_u8(&w[5 * 16]))); - vst1q_u8(&w[6 * 16], vaesimcq_u8(vld1q_u8(&w[6 * 16]))); - vst1q_u8(&w[7 * 16], vaesimcq_u8(vld1q_u8(&w[7 * 16]))); - vst1q_u8(&w[8 * 16], vaesimcq_u8(vld1q_u8(&w[8 * 16]))); - vst1q_u8(&w[9 * 16], vaesimcq_u8(vld1q_u8(&w[9 * 16]))); - } + void aes_prepare_decryption_128(uint8_t w[176]) noexcept + { + vst1q_u8(&w[1 * 16], vaesimcq_u8(vld1q_u8(&w[1 * 16]))); + vst1q_u8(&w[2 * 16], vaesimcq_u8(vld1q_u8(&w[2 * 16]))); + vst1q_u8(&w[3 * 16], vaesimcq_u8(vld1q_u8(&w[3 * 16]))); + vst1q_u8(&w[4 * 16], vaesimcq_u8(vld1q_u8(&w[4 * 16]))); + vst1q_u8(&w[5 * 16], vaesimcq_u8(vld1q_u8(&w[5 * 16]))); + vst1q_u8(&w[6 * 16], vaesimcq_u8(vld1q_u8(&w[6 * 16]))); + vst1q_u8(&w[7 * 16], vaesimcq_u8(vld1q_u8(&w[7 * 16]))); + vst1q_u8(&w[8 * 16], vaesimcq_u8(vld1q_u8(&w[8 * 16]))); + vst1q_u8(&w[9 * 16], vaesimcq_u8(vld1q_u8(&w[9 * 16]))); + } - void aes_prepare_decryption_192(uint8_t w[208]) noexcept - { - vst1q_u8(&w[1 * 16], vaesimcq_u8(vld1q_u8(&w[1 * 16]))); - vst1q_u8(&w[2 * 16], vaesimcq_u8(vld1q_u8(&w[2 * 16]))); - vst1q_u8(&w[3 * 16], vaesimcq_u8(vld1q_u8(&w[3 * 16]))); - vst1q_u8(&w[4 * 16], vaesimcq_u8(vld1q_u8(&w[4 * 16]))); - vst1q_u8(&w[5 * 16], vaesimcq_u8(vld1q_u8(&w[5 * 16]))); - vst1q_u8(&w[6 * 16], vaesimcq_u8(vld1q_u8(&w[6 * 16]))); - vst1q_u8(&w[7 * 16], vaesimcq_u8(vld1q_u8(&w[7 * 16]))); - vst1q_u8(&w[8 * 16], vaesimcq_u8(vld1q_u8(&w[8 * 16]))); - vst1q_u8(&w[9 * 16], vaesimcq_u8(vld1q_u8(&w[9 * 16]))); - vst1q_u8(&w[10 * 16], vaesimcq_u8(vld1q_u8(&w[10 * 16]))); - vst1q_u8(&w[11 * 16], vaesimcq_u8(vld1q_u8(&w[11 * 16]))); - } + void aes_prepare_decryption_192(uint8_t w[208]) noexcept + { + vst1q_u8(&w[1 * 16], vaesimcq_u8(vld1q_u8(&w[1 * 16]))); + vst1q_u8(&w[2 * 16], vaesimcq_u8(vld1q_u8(&w[2 * 16]))); + vst1q_u8(&w[3 * 16], vaesimcq_u8(vld1q_u8(&w[3 * 16]))); + vst1q_u8(&w[4 * 16], vaesimcq_u8(vld1q_u8(&w[4 * 16]))); + vst1q_u8(&w[5 * 16], vaesimcq_u8(vld1q_u8(&w[5 * 16]))); + vst1q_u8(&w[6 * 16], vaesimcq_u8(vld1q_u8(&w[6 * 16]))); + vst1q_u8(&w[7 * 16], vaesimcq_u8(vld1q_u8(&w[7 * 16]))); + vst1q_u8(&w[8 * 16], vaesimcq_u8(vld1q_u8(&w[8 * 16]))); + vst1q_u8(&w[9 * 16], vaesimcq_u8(vld1q_u8(&w[9 * 16]))); + vst1q_u8(&w[10 * 16], vaesimcq_u8(vld1q_u8(&w[10 * 16]))); + vst1q_u8(&w[11 * 16], vaesimcq_u8(vld1q_u8(&w[11 * 16]))); + } - void aes_prepare_decryption_256(uint8_t w[240]) noexcept - { - vst1q_u8(&w[1 * 16], vaesimcq_u8(vld1q_u8(&w[1 * 16]))); - vst1q_u8(&w[2 * 16], vaesimcq_u8(vld1q_u8(&w[2 * 16]))); - vst1q_u8(&w[3 * 16], vaesimcq_u8(vld1q_u8(&w[3 * 16]))); - vst1q_u8(&w[4 * 16], vaesimcq_u8(vld1q_u8(&w[4 * 16]))); - vst1q_u8(&w[5 * 16], vaesimcq_u8(vld1q_u8(&w[5 * 16]))); - vst1q_u8(&w[6 * 16], vaesimcq_u8(vld1q_u8(&w[6 * 16]))); - vst1q_u8(&w[7 * 16], vaesimcq_u8(vld1q_u8(&w[7 * 16]))); - vst1q_u8(&w[8 * 16], vaesimcq_u8(vld1q_u8(&w[8 * 16]))); - vst1q_u8(&w[9 * 16], vaesimcq_u8(vld1q_u8(&w[9 * 16]))); - vst1q_u8(&w[10 * 16], vaesimcq_u8(vld1q_u8(&w[10 * 16]))); - vst1q_u8(&w[11 * 16], vaesimcq_u8(vld1q_u8(&w[11 * 16]))); - vst1q_u8(&w[12 * 16], vaesimcq_u8(vld1q_u8(&w[12 * 16]))); - vst1q_u8(&w[13 * 16], vaesimcq_u8(vld1q_u8(&w[13 * 16]))); - } + void aes_prepare_decryption_256(uint8_t w[240]) noexcept + { + vst1q_u8(&w[1 * 16], vaesimcq_u8(vld1q_u8(&w[1 * 16]))); + vst1q_u8(&w[2 * 16], vaesimcq_u8(vld1q_u8(&w[2 * 16]))); + vst1q_u8(&w[3 * 16], vaesimcq_u8(vld1q_u8(&w[3 * 16]))); + vst1q_u8(&w[4 * 16], vaesimcq_u8(vld1q_u8(&w[4 * 16]))); + vst1q_u8(&w[5 * 16], vaesimcq_u8(vld1q_u8(&w[5 * 16]))); + vst1q_u8(&w[6 * 16], vaesimcq_u8(vld1q_u8(&w[6 * 16]))); + vst1q_u8(&w[7 * 16], vaesimcq_u8(vld1q_u8(&w[7 * 16]))); + vst1q_u8(&w[8 * 16], vaesimcq_u8(vld1q_u8(&w[8 * 16]))); + vst1q_u8(&w[9 * 16], vaesimcq_u8(vld1q_u8(&w[9 * 16]))); + vst1q_u8(&w[10 * 16], vaesimcq_u8(vld1q_u8(&w[10 * 16]))); + vst1q_u8(&w[11 * 16], vaesimcq_u8(vld1q_u8(&w[11 * 16]))); + vst1q_u8(&w[12 * 16], vaesimcq_u8(vld1q_u8(&w[12 * 16]))); + vst1q_u8(&w[13 * 16], vaesimcq_u8(vld1q_u8(&w[13 * 16]))); + } - void aes_decrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept - { - auto data = vld1q_u8(in); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[10 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[9 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[8 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[7 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[6 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[5 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[4 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[3 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[2 * 16]))); - data = vaesdq_u8(data, vld1q_u8(&roundKeys[1 * 16])); - data = veorq_u8(data, vld1q_u8(&roundKeys[0 * 16])); - vst1q_u8(out, data); - } + void aes_decrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept + { + auto data = vld1q_u8(in); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[10 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[9 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[8 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[7 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[6 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[5 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[4 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[3 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[2 * 16]))); + data = vaesdq_u8(data, vld1q_u8(&roundKeys[1 * 16])); + data = veorq_u8(data, vld1q_u8(&roundKeys[0 * 16])); + vst1q_u8(out, data); + } - void aes_decrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept - { - auto data = vld1q_u8(in); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[12 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[11 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[10 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[9 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[8 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[7 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[6 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[5 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[4 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[3 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[2 * 16]))); - data = vaesdq_u8(data, vld1q_u8(&roundKeys[1 * 16])); - data = veorq_u8(data, vld1q_u8(&roundKeys[0 * 16])); - vst1q_u8(out, data); - } + void aes_decrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept + { + auto data = vld1q_u8(in); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[12 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[11 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[10 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[9 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[8 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[7 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[6 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[5 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[4 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[3 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[2 * 16]))); + data = vaesdq_u8(data, vld1q_u8(&roundKeys[1 * 16])); + data = veorq_u8(data, vld1q_u8(&roundKeys[0 * 16])); + vst1q_u8(out, data); + } - void aes_decrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept - { - auto data = vld1q_u8(in); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[14 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[13 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[12 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[11 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[10 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[9 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[8 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[7 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[6 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[5 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[4 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[3 * 16]))); - data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[2 * 16]))); - data = vaesdq_u8(data, vld1q_u8(&roundKeys[1 * 16])); - data = veorq_u8(data, vld1q_u8(&roundKeys[0 * 16])); - vst1q_u8(out, data); - } + void aes_decrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept + { + auto data = vld1q_u8(in); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[14 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[13 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[12 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[11 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[10 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[9 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[8 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[7 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[6 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[5 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[4 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[3 * 16]))); + data = vaesimcq_u8(vaesdq_u8(data, vld1q_u8(&roundKeys[2 * 16]))); + data = vaesdq_u8(data, vld1q_u8(&roundKeys[1 * 16])); + data = veorq_u8(data, vld1q_u8(&roundKeys[0 * 16])); + vst1q_u8(out, data); + } #endif + } } diff --git a/src/vendor/Soup/Intrin/crc32_intrin.cpp b/src/vendor/Soup/Intrin/crc32_intrin.cpp index 06e3338319..7ade7b6e5d 100644 --- a/src/vendor/Soup/Intrin/crc32_intrin.cpp +++ b/src/vendor/Soup/Intrin/crc32_intrin.cpp @@ -1,63 +1,68 @@ +#include "../soup/base.hpp" + #include #include -#if defined(__x86_64__) || defined(_M_X64) +#if SOUP_X86 #include // _mm_extract_epi32 #include // _mm_clmulepi64_si128 -#elif defined(__aarch64__) || defined(_M_ARM64) - #ifdef _WIN32 +#elif SOUP_ARM + #if SOUP_WINDOWS #include #else #include #endif #endif -namespace soup_intrin +NAMESPACE_SOUP { -#if defined(__x86_64__) || defined(_M_X64) - uint32_t crc32_pclmul(const uint8_t* p, size_t size, uint32_t crc) noexcept + namespace intrin { - // Original source: https://github.com/richgel999/fpng/blob/main/src/fpng.cpp - // Original licence: Dedicated to the public domain. +#if SOUP_X86 + uint32_t crc32_pclmul(const uint8_t* p, size_t size, uint32_t crc) noexcept + { + // Original source: https://github.com/richgel999/fpng/blob/main/src/fpng.cpp + // Original licence: Dedicated to the public domain. - static const uint64_t + static const uint64_t #ifdef _MSC_VER - __declspec(align(16)) + __declspec(align(16)) #else - __attribute__((aligned(16))) + __attribute__((aligned(16))) #endif - s_u[2] = { 0x1DB710641, 0x1F7011641 }, s_k5k0[2] = { 0x163CD6124, 0 }, s_k3k4[2] = { 0x1751997D0, 0xCCAA009E }; + s_u[2] = { 0x1DB710641, 0x1F7011641 }, s_k5k0[2] = { 0x163CD6124, 0 }, s_k3k4[2] = { 0x1751997D0, 0xCCAA009E }; - // Load first 16 bytes, apply initial CRC32 - __m128i b = _mm_xor_si128(_mm_cvtsi32_si128(~crc), _mm_loadu_si128(reinterpret_cast(p))); + // Load first 16 bytes, apply initial CRC32 + __m128i b = _mm_xor_si128(_mm_cvtsi32_si128(~crc), _mm_loadu_si128(reinterpret_cast(p))); - // We're skipping directly to Step 2 page 12 - iteratively folding by 1 (by 4 is overkill for our needs) - const __m128i k3k4 = _mm_load_si128(reinterpret_cast(s_k3k4)); + // We're skipping directly to Step 2 page 12 - iteratively folding by 1 (by 4 is overkill for our needs) + const __m128i k3k4 = _mm_load_si128(reinterpret_cast(s_k3k4)); - for (size -= 16, p += 16; size >= 16; size -= 16, p += 16) - b = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(b, k3k4, 17), _mm_loadu_si128(reinterpret_cast(p))), _mm_clmulepi64_si128(b, k3k4, 0)); + for (size -= 16, p += 16; size >= 16; size -= 16, p += 16) + b = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(b, k3k4, 17), _mm_loadu_si128(reinterpret_cast(p))), _mm_clmulepi64_si128(b, k3k4, 0)); - // Final stages: fold to 64-bits, 32-bit Barrett reduction - const __m128i z = _mm_set_epi32(0, ~0, 0, ~0), u = _mm_load_si128(reinterpret_cast(s_u)); - b = _mm_xor_si128(_mm_srli_si128(b, 8), _mm_clmulepi64_si128(b, k3k4, 16)); - b = _mm_xor_si128(_mm_clmulepi64_si128(_mm_and_si128(b, z), _mm_loadl_epi64(reinterpret_cast(s_k5k0)), 0), _mm_srli_si128(b, 4)); - return ~_mm_extract_epi32(_mm_xor_si128(b, _mm_clmulepi64_si128(_mm_and_si128(_mm_clmulepi64_si128(_mm_and_si128(b, z), u, 16), z), u, 0)), 1); - } -#elif defined(__aarch64__) || defined(_M_ARM64) - uint32_t crc32_armv8(const uint8_t* p, size_t size, uint32_t crc) noexcept - { - crc = ~crc; - for (; size >= 8; size -= 8) - { - crc = __crc32d(crc, *reinterpret_cast(p)); - p += 8; + // Final stages: fold to 64-bits, 32-bit Barrett reduction + const __m128i z = _mm_set_epi32(0, ~0, 0, ~0), u = _mm_load_si128(reinterpret_cast(s_u)); + b = _mm_xor_si128(_mm_srli_si128(b, 8), _mm_clmulepi64_si128(b, k3k4, 16)); + b = _mm_xor_si128(_mm_clmulepi64_si128(_mm_and_si128(b, z), _mm_loadl_epi64(reinterpret_cast(s_k5k0)), 0), _mm_srli_si128(b, 4)); + return ~_mm_extract_epi32(_mm_xor_si128(b, _mm_clmulepi64_si128(_mm_and_si128(_mm_clmulepi64_si128(_mm_and_si128(b, z), u, 16), z), u, 0)), 1); } - while (size--) +#elif SOUP_ARM + uint32_t crc32_armv8(const uint8_t* p, size_t size, uint32_t crc) noexcept { - crc = __crc32b(crc, *p++); + crc = ~crc; + for (; size >= 8; size -= 8) + { + crc = __crc32d(crc, *reinterpret_cast(p)); + p += 8; + } + while (size--) + { + crc = __crc32b(crc, *p++); + } + crc = ~crc; + return crc; } - crc = ~crc; - return crc; - } #endif + } } diff --git a/src/vendor/Soup/Intrin/hardware_rng.cpp b/src/vendor/Soup/Intrin/hardware_rng.cpp index 1b0354b046..fc8dd8a7c2 100644 --- a/src/vendor/Soup/Intrin/hardware_rng.cpp +++ b/src/vendor/Soup/Intrin/hardware_rng.cpp @@ -1,62 +1,38 @@ -#if defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(__i386__) || defined(__i386) || defined(_M_IX86) +#include "../soup/base.hpp" +#if SOUP_X86 #include #include -namespace soup_intrin +NAMESPACE_SOUP { - // RDSEED - - uint16_t hardware_rng_generate16() noexcept + namespace intrin { - uint16_t res; - while (_rdseed16_step(&res) == 0); - return res; - } - - uint32_t hardware_rng_generate32() noexcept - { - uint32_t res; - while (_rdseed32_step(&res) == 0); - return res; - } - -#if defined(__x86_64__) || defined(_M_X64) - uint64_t hardware_rng_generate64() noexcept - { - unsigned long long res; - while (_rdseed64_step(&res) == 0); - return res; - } - static_assert(sizeof(uint64_t) == sizeof(unsigned long long)); + uint16_t hardware_rng_generate16() noexcept + { + uint16_t res; + while (_rdseed16_step(&res) == 0); + return res; + } + + uint32_t hardware_rng_generate32() noexcept + { + uint32_t res; + while (_rdseed32_step(&res) == 0); + return res; + } + +#if SOUP_BITS == 64 + uint64_t hardware_rng_generate64() noexcept + { + unsigned long long res; + while (_rdseed64_step(&res) == 0); + return res; + } + static_assert(sizeof(uint64_t) == sizeof(unsigned long long)); #endif - - // RDRAND - - uint16_t fast_hardware_rng_generate16() noexcept - { - uint16_t res; - while (_rdrand16_step(&res) == 0); - return res; - } - - uint32_t fast_hardware_rng_generate32() noexcept - { - uint32_t res; - while (_rdrand32_step(&res) == 0); - return res; } - -#if defined(__x86_64__) || defined(_M_X64) - uint64_t fast_hardware_rng_generate64() noexcept - { - unsigned long long res; - while (_rdrand64_step(&res) == 0); - return res; - } - static_assert(sizeof(uint64_t) == sizeof(unsigned long long)); -#endif } #endif diff --git a/src/vendor/Soup/Intrin/sha1_transform.cpp b/src/vendor/Soup/Intrin/sha1_transform.cpp index 77c2b402f5..9d5fbe1c47 100644 --- a/src/vendor/Soup/Intrin/sha1_transform.cpp +++ b/src/vendor/Soup/Intrin/sha1_transform.cpp @@ -1,361 +1,366 @@ +#include "../soup/base.hpp" + #include -#if defined(__x86_64__) || defined(_M_X64) +#if SOUP_X86 #include -#elif defined(__aarch64__) || defined(_M_ARM64) +#elif SOUP_ARM #include #endif -namespace soup_intrin +NAMESPACE_SOUP { - // Original source: https://github.com/noloader/SHA-Intrinsics - // Original licence: Dedicated to the public domain. - -#if defined(__x86_64__) || defined(_M_X64) - void sha1_transform(uint32_t state[5], const uint8_t data[64]) noexcept - { - __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; - __m128i MSG0, MSG1, MSG2, MSG3; - const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); - - /* Load initial values */ - ABCD = _mm_loadu_si128((const __m128i*) state); - E0 = _mm_set_epi32(state[4], 0, 0, 0); - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - - /* Save current state */ - ABCD_SAVE = ABCD; - E0_SAVE = E0; - - /* Rounds 0-3 */ - MSG0 = _mm_loadu_si128((const __m128i*)(data + 0)); - MSG0 = _mm_shuffle_epi8(MSG0, MASK); - E0 = _mm_add_epi32(E0, MSG0); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128((const __m128i*)(data + 16)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128((const __m128i*)(data + 32)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128((const __m128i*)(data + 48)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 16-19 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 20-23 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 24-27 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 28-31 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 32-35 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 36-39 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 40-43 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 44-47 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 48-51 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 52-55 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); - MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 56-59 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); - MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); - MSG0 = _mm_xor_si128(MSG0, MSG2); - - /* Rounds 60-63 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); - MSG1 = _mm_xor_si128(MSG1, MSG3); - - /* Rounds 64-67 */ - E0 = _mm_sha1nexte_epu32(E0, MSG0); - E1 = ABCD; - MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); - MSG2 = _mm_xor_si128(MSG2, MSG0); - - /* Rounds 68-71 */ - E1 = _mm_sha1nexte_epu32(E1, MSG1); - E0 = ABCD; - MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - MSG3 = _mm_xor_si128(MSG3, MSG1); - - /* Rounds 72-75 */ - E0 = _mm_sha1nexte_epu32(E0, MSG2); - E1 = ABCD; - MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); - ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); - - /* Rounds 76-79 */ - E1 = _mm_sha1nexte_epu32(E1, MSG3); - E0 = ABCD; - ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); - - /* Combine state */ - E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); - ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); - - /* Save state */ - ABCD = _mm_shuffle_epi32(ABCD, 0x1B); - _mm_storeu_si128((__m128i*) state, ABCD); - state[4] = _mm_extract_epi32(E0, 3); - } -#elif defined(__aarch64__) || defined(_M_ARM64) - void sha1_transform(uint32_t state[5], const uint8_t data[64]) noexcept + namespace intrin { - uint32x4_t ABCD, ABCD_SAVED; - uint32x4_t TMP0, TMP1; - uint32x4_t MSG0, MSG1, MSG2, MSG3; - uint32_t E0, E0_SAVED, E1; - - /* Load state */ - ABCD = vld1q_u32(&state[0]); - E0 = state[4]; - - /* Save state */ - ABCD_SAVED = ABCD; - E0_SAVED = E0; - - /* Load message */ - MSG0 = vld1q_u32((const uint32_t*)(data)); - MSG1 = vld1q_u32((const uint32_t*)(data + 16)); - MSG2 = vld1q_u32((const uint32_t*)(data + 32)); - MSG3 = vld1q_u32((const uint32_t*)(data + 48)); - - TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999)); - TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x5A827999)); - - /* Rounds 0-3 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x5A827999)); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - /* Rounds 4-7 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x5A827999)); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - /* Rounds 8-11 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999)); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - /* Rounds 12-15 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1)); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - /* Rounds 16-19 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1cq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x6ED9EBA1)); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - /* Rounds 20-23 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x6ED9EBA1)); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - /* Rounds 24-27 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x6ED9EBA1)); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - /* Rounds 28-31 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1)); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - /* Rounds 32-35 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC)); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - /* Rounds 36-39 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x8F1BBCDC)); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - /* Rounds 40-43 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x8F1BBCDC)); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - /* Rounds 44-47 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x8F1BBCDC)); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - /* Rounds 48-51 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC)); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - /* Rounds 52-55 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6)); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); - - /* Rounds 56-59 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1mq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0xCA62C1D6)); - MSG1 = vsha1su1q_u32(MSG1, MSG0); - MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); - - /* Rounds 60-63 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0xCA62C1D6)); - MSG2 = vsha1su1q_u32(MSG2, MSG1); - MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); - - /* Rounds 64-67 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0xCA62C1D6)); - MSG3 = vsha1su1q_u32(MSG3, MSG2); - MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); - - /* Rounds 68-71 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6)); - MSG0 = vsha1su1q_u32(MSG0, MSG3); - - /* Rounds 72-75 */ - E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E0, TMP0); - - /* Rounds 76-79 */ - E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); - ABCD = vsha1pq_u32(ABCD, E1, TMP1); - - /* Combine state */ - E0 += E0_SAVED; - ABCD = vaddq_u32(ABCD_SAVED, ABCD); - - /* Save state */ - vst1q_u32(&state[0], ABCD); - state[4] = E0; - } + // Original source: https://github.com/noloader/SHA-Intrinsics + // Original licence: Dedicated to the public domain. + +#if SOUP_X86 + void sha1_transform(uint32_t state[5], const uint8_t data[64]) noexcept + { + __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1; + __m128i MSG0, MSG1, MSG2, MSG3; + const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL); + + /* Load initial values */ + ABCD = _mm_loadu_si128((const __m128i*) state); + E0 = _mm_set_epi32(state[4], 0, 0, 0); + ABCD = _mm_shuffle_epi32(ABCD, 0x1B); + + /* Save current state */ + ABCD_SAVE = ABCD; + E0_SAVE = E0; + + /* Rounds 0-3 */ + MSG0 = _mm_loadu_si128((const __m128i*)(data + 0)); + MSG0 = _mm_shuffle_epi8(MSG0, MASK); + E0 = _mm_add_epi32(E0, MSG0); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128((const __m128i*)(data + 16)); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128((const __m128i*)(data + 32)); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128((const __m128i*)(data + 48)); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 16-19 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 20-23 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 24-27 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 28-31 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 32-35 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 36-39 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 40-43 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 44-47 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 48-51 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 52-55 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2); + MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 56-59 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2); + MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2); + MSG0 = _mm_xor_si128(MSG0, MSG2); + + /* Rounds 60-63 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3); + MSG1 = _mm_xor_si128(MSG1, MSG3); + + /* Rounds 64-67 */ + E0 = _mm_sha1nexte_epu32(E0, MSG0); + E1 = ABCD; + MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0); + MSG2 = _mm_xor_si128(MSG2, MSG0); + + /* Rounds 68-71 */ + E1 = _mm_sha1nexte_epu32(E1, MSG1); + E0 = ABCD; + MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1); + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + MSG3 = _mm_xor_si128(MSG3, MSG1); + + /* Rounds 72-75 */ + E0 = _mm_sha1nexte_epu32(E0, MSG2); + E1 = ABCD; + MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2); + ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3); + + /* Rounds 76-79 */ + E1 = _mm_sha1nexte_epu32(E1, MSG3); + E0 = ABCD; + ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3); + + /* Combine state */ + E0 = _mm_sha1nexte_epu32(E0, E0_SAVE); + ABCD = _mm_add_epi32(ABCD, ABCD_SAVE); + + /* Save state */ + ABCD = _mm_shuffle_epi32(ABCD, 0x1B); + _mm_storeu_si128((__m128i*) state, ABCD); + state[4] = _mm_extract_epi32(E0, 3); + } +#elif SOUP_ARM + void sha1_transform(uint32_t state[5], const uint8_t data[64]) noexcept + { + uint32x4_t ABCD, ABCD_SAVED; + uint32x4_t TMP0, TMP1; + uint32x4_t MSG0, MSG1, MSG2, MSG3; + uint32_t E0, E0_SAVED, E1; + + /* Load state */ + ABCD = vld1q_u32(&state[0]); + E0 = state[4]; + + /* Save state */ + ABCD_SAVED = ABCD; + E0_SAVED = E0; + + /* Load message */ + MSG0 = vld1q_u32((const uint32_t*)(data)); + MSG1 = vld1q_u32((const uint32_t*)(data + 16)); + MSG2 = vld1q_u32((const uint32_t*)(data + 32)); + MSG3 = vld1q_u32((const uint32_t*)(data + 48)); + + TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999)); + TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x5A827999)); + + /* Rounds 0-3 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x5A827999)); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + /* Rounds 4-7 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x5A827999)); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + + /* Rounds 8-11 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x5A827999)); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + + /* Rounds 12-15 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1)); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + + /* Rounds 16-19 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1cq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x6ED9EBA1)); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + /* Rounds 20-23 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x6ED9EBA1)); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + + /* Rounds 24-27 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x6ED9EBA1)); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + + /* Rounds 28-31 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x6ED9EBA1)); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + + /* Rounds 32-35 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC)); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + /* Rounds 36-39 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0x8F1BBCDC)); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + + /* Rounds 40-43 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0x8F1BBCDC)); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + + /* Rounds 44-47 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0x8F1BBCDC)); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + + /* Rounds 48-51 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0x8F1BBCDC)); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + /* Rounds 52-55 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6)); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3); + + /* Rounds 56-59 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1mq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG0, vdupq_n_u32(0xCA62C1D6)); + MSG1 = vsha1su1q_u32(MSG1, MSG0); + MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0); + + /* Rounds 60-63 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG1, vdupq_n_u32(0xCA62C1D6)); + MSG2 = vsha1su1q_u32(MSG2, MSG1); + MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1); + + /* Rounds 64-67 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + TMP0 = vaddq_u32(MSG2, vdupq_n_u32(0xCA62C1D6)); + MSG3 = vsha1su1q_u32(MSG3, MSG2); + MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2); + + /* Rounds 68-71 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + TMP1 = vaddq_u32(MSG3, vdupq_n_u32(0xCA62C1D6)); + MSG0 = vsha1su1q_u32(MSG0, MSG3); + + /* Rounds 72-75 */ + E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E0, TMP0); + + /* Rounds 76-79 */ + E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0)); + ABCD = vsha1pq_u32(ABCD, E1, TMP1); + + /* Combine state */ + E0 += E0_SAVED; + ABCD = vaddq_u32(ABCD_SAVED, ABCD); + + /* Save state */ + vst1q_u32(&state[0], ABCD); + state[4] = E0; + } #endif + } } diff --git a/src/vendor/Soup/Intrin/sha256_transform.cpp b/src/vendor/Soup/Intrin/sha256_transform.cpp index 8f58c3558f..4e7f0db64c 100644 --- a/src/vendor/Soup/Intrin/sha256_transform.cpp +++ b/src/vendor/Soup/Intrin/sha256_transform.cpp @@ -1,377 +1,382 @@ +#include "../soup/base.hpp" + #include -#if defined(__x86_64__) || defined(_M_X64) - #include -#elif defined(__aarch64__) || defined(_M_ARM64) - #include +#if SOUP_X86 +#include +#elif SOUP_ARM +#include #endif -namespace soup_intrin +NAMESPACE_SOUP { - // Original source: https://github.com/noloader/SHA-Intrinsics - // Original licence: Dedicated to the public domain. - -#if defined(__x86_64__) || defined(_M_X64) - void sha256_transform(uint32_t state[8], const uint8_t data[64]) noexcept - { - __m128i STATE0, STATE1; - __m128i MSG, TMP; - __m128i MSG0, MSG1, MSG2, MSG3; - __m128i ABEF_SAVE, CDGH_SAVE; - const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); - - /* Load initial values */ - TMP = _mm_loadu_si128((const __m128i*) &state[0]); - STATE1 = _mm_loadu_si128((const __m128i*) &state[4]); - - TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */ - STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */ - STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */ - STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */ - - /* Save current state */ - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - /* Rounds 0-3 */ - MSG = _mm_loadu_si128((const __m128i*) (data + 0)); - MSG0 = _mm_shuffle_epi8(MSG, MASK); - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 4-7 */ - MSG1 = _mm_loadu_si128((const __m128i*) (data + 16)); - MSG1 = _mm_shuffle_epi8(MSG1, MASK); - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 8-11 */ - MSG2 = _mm_loadu_si128((const __m128i*) (data + 32)); - MSG2 = _mm_shuffle_epi8(MSG2, MASK); - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 12-15 */ - MSG3 = _mm_loadu_si128((const __m128i*) (data + 48)); - MSG3 = _mm_shuffle_epi8(MSG3, MASK); - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 16-19 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 20-23 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 24-27 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 28-31 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 32-35 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 36-39 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); - - /* Rounds 40-43 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); - - /* Rounds 44-47 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG3, MSG2, 4); - MSG0 = _mm_add_epi32(MSG0, TMP); - MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); - - /* Rounds 48-51 */ - MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG0, MSG3, 4); - MSG1 = _mm_add_epi32(MSG1, TMP); - MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); - - /* Rounds 52-55 */ - MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG1, MSG0, 4); - MSG2 = _mm_add_epi32(MSG2, TMP); - MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 56-59 */ - MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - TMP = _mm_alignr_epi8(MSG2, MSG1, 4); - MSG3 = _mm_add_epi32(MSG3, TMP); - MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Rounds 60-63 */ - MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); - STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); - MSG = _mm_shuffle_epi32(MSG, 0x0E); - STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); - - /* Combine state */ - STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); - STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); - - TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */ - STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */ - STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */ - STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* ABEF */ - - /* Save state */ - _mm_storeu_si128((__m128i*) &state[0], STATE0); - _mm_storeu_si128((__m128i*) &state[4], STATE1); - } -#elif defined(__aarch64__) || defined(_M_ARM64) - inline const uint32_t sha256_k[8 * 8] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, - }; - - void sha256_transform(uint32_t state[8], const uint8_t data[64]) noexcept + namespace intrin { - uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; - uint32x4_t MSG0, MSG1, MSG2, MSG3; - uint32x4_t TMP0, TMP1, TMP2; - - /* Load state */ - STATE0 = vld1q_u32(&state[0]); - STATE1 = vld1q_u32(&state[4]); - - /* Save state */ - ABEF_SAVE = STATE0; - CDGH_SAVE = STATE1; - - /* Load message */ - MSG0 = vld1q_u32((const uint32_t*)(data + 0)); - MSG1 = vld1q_u32((const uint32_t*)(data + 16)); - MSG2 = vld1q_u32((const uint32_t*)(data + 32)); - MSG3 = vld1q_u32((const uint32_t*)(data + 48)); - - /* Reverse for little endian */ - MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); - MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); - MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); - MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); - - TMP0 = vaddq_u32(MSG0, vld1q_u32(&sha256_k[0x00])); - - /* Rounds 0-3 */ - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&sha256_k[0x04])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); - - /* Rounds 4-7 */ - MSG1 = vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&sha256_k[0x08])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); - - /* Rounds 8-11 */ - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&sha256_k[0x0c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); - - /* Rounds 12-15 */ - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&sha256_k[0x10])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); - - /* Rounds 16-19 */ - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&sha256_k[0x14])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); - - /* Rounds 20-23 */ - MSG1 = vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&sha256_k[0x18])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); - - /* Rounds 24-27 */ - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&sha256_k[0x1c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); - - /* Rounds 28-31 */ - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&sha256_k[0x20])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); - - /* Rounds 32-35 */ - MSG0 = vsha256su0q_u32(MSG0, MSG1); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&sha256_k[0x24])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); - - /* Rounds 36-39 */ - MSG1 = vsha256su0q_u32(MSG1, MSG2); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&sha256_k[0x28])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); - - /* Rounds 40-43 */ - MSG2 = vsha256su0q_u32(MSG2, MSG3); - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&sha256_k[0x2c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); - - /* Rounds 44-47 */ - MSG3 = vsha256su0q_u32(MSG3, MSG0); - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG0, vld1q_u32(&sha256_k[0x30])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); - - /* Rounds 48-51 */ - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG1, vld1q_u32(&sha256_k[0x34])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - - /* Rounds 52-55 */ - TMP2 = STATE0; - TMP0 = vaddq_u32(MSG2, vld1q_u32(&sha256_k[0x38])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - - /* Rounds 56-59 */ - TMP2 = STATE0; - TMP1 = vaddq_u32(MSG3, vld1q_u32(&sha256_k[0x3c])); - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); - - /* Rounds 60-63 */ - TMP2 = STATE0; - STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); - STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); - - /* Combine state */ - STATE0 = vaddq_u32(STATE0, ABEF_SAVE); - STATE1 = vaddq_u32(STATE1, CDGH_SAVE); - - /* Save state */ - vst1q_u32(&state[0], STATE0); - vst1q_u32(&state[4], STATE1); - } + // Original source: https://github.com/noloader/SHA-Intrinsics + // Original licence: Dedicated to the public domain. + +#if SOUP_X86 + void sha256_transform(uint32_t state[8], const uint8_t data[64]) noexcept + { + __m128i STATE0, STATE1; + __m128i MSG, TMP; + __m128i MSG0, MSG1, MSG2, MSG3; + __m128i ABEF_SAVE, CDGH_SAVE; + const __m128i MASK = _mm_set_epi64x(0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL); + + /* Load initial values */ + TMP = _mm_loadu_si128((const __m128i*) & state[0]); + STATE1 = _mm_loadu_si128((const __m128i*) & state[4]); + + TMP = _mm_shuffle_epi32(TMP, 0xB1); /* CDAB */ + STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); /* EFGH */ + STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); /* ABEF */ + STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); /* CDGH */ + + /* Save current state */ + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + /* Rounds 0-3 */ + MSG = _mm_loadu_si128((const __m128i*) (data + 0)); + MSG0 = _mm_shuffle_epi8(MSG, MASK); + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0xE9B5DBA5B5C0FBCFULL, 0x71374491428A2F98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 4-7 */ + MSG1 = _mm_loadu_si128((const __m128i*) (data + 16)); + MSG1 = _mm_shuffle_epi8(MSG1, MASK); + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0xAB1C5ED5923F82A4ULL, 0x59F111F13956C25BULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 8-11 */ + MSG2 = _mm_loadu_si128((const __m128i*) (data + 32)); + MSG2 = _mm_shuffle_epi8(MSG2, MASK); + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x550C7DC3243185BEULL, 0x12835B01D807AA98ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 12-15 */ + MSG3 = _mm_loadu_si128((const __m128i*) (data + 48)); + MSG3 = _mm_shuffle_epi8(MSG3, MASK); + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC19BF1749BDC06A7ULL, 0x80DEB1FE72BE5D74ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 16-19 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x240CA1CC0FC19DC6ULL, 0xEFBE4786E49B69C1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 20-23 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x76F988DA5CB0A9DCULL, 0x4A7484AA2DE92C6FULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 24-27 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xBF597FC7B00327C8ULL, 0xA831C66D983E5152ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 28-31 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x1429296706CA6351ULL, 0xD5A79147C6E00BF3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 32-35 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x53380D134D2C6DFCULL, 0x2E1B213827B70A85ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 36-39 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x92722C8581C2C92EULL, 0x766A0ABB650A7354ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG0 = _mm_sha256msg1_epu32(MSG0, MSG1); + + /* Rounds 40-43 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0xC76C51A3C24B8B70ULL, 0xA81A664BA2BFE8A1ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG1 = _mm_sha256msg1_epu32(MSG1, MSG2); + + /* Rounds 44-47 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0x106AA070F40E3585ULL, 0xD6990624D192E819ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG3, MSG2, 4); + MSG0 = _mm_add_epi32(MSG0, TMP); + MSG0 = _mm_sha256msg2_epu32(MSG0, MSG3); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG2 = _mm_sha256msg1_epu32(MSG2, MSG3); + + /* Rounds 48-51 */ + MSG = _mm_add_epi32(MSG0, _mm_set_epi64x(0x34B0BCB52748774CULL, 0x1E376C0819A4C116ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG0, MSG3, 4); + MSG1 = _mm_add_epi32(MSG1, TMP); + MSG1 = _mm_sha256msg2_epu32(MSG1, MSG0); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + MSG3 = _mm_sha256msg1_epu32(MSG3, MSG0); + + /* Rounds 52-55 */ + MSG = _mm_add_epi32(MSG1, _mm_set_epi64x(0x682E6FF35B9CCA4FULL, 0x4ED8AA4A391C0CB3ULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG1, MSG0, 4); + MSG2 = _mm_add_epi32(MSG2, TMP); + MSG2 = _mm_sha256msg2_epu32(MSG2, MSG1); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 56-59 */ + MSG = _mm_add_epi32(MSG2, _mm_set_epi64x(0x8CC7020884C87814ULL, 0x78A5636F748F82EEULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + TMP = _mm_alignr_epi8(MSG2, MSG1, 4); + MSG3 = _mm_add_epi32(MSG3, TMP); + MSG3 = _mm_sha256msg2_epu32(MSG3, MSG2); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Rounds 60-63 */ + MSG = _mm_add_epi32(MSG3, _mm_set_epi64x(0xC67178F2BEF9A3F7ULL, 0xA4506CEB90BEFFFAULL)); + STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG); + MSG = _mm_shuffle_epi32(MSG, 0x0E); + STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG); + + /* Combine state */ + STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE); + STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE); + + TMP = _mm_shuffle_epi32(STATE0, 0x1B); /* FEBA */ + STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); /* DCHG */ + STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); /* DCBA */ + STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); /* ABEF */ + + /* Save state */ + _mm_storeu_si128((__m128i*) & state[0], STATE0); + _mm_storeu_si128((__m128i*) & state[4], STATE1); + } +#elif SOUP_ARM + inline const uint32_t sha256_k[8 * 8] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, + }; + + void sha256_transform(uint32_t state[8], const uint8_t data[64]) noexcept + { + uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE; + uint32x4_t MSG0, MSG1, MSG2, MSG3; + uint32x4_t TMP0, TMP1, TMP2; + + /* Load state */ + STATE0 = vld1q_u32(&state[0]); + STATE1 = vld1q_u32(&state[4]); + + /* Save state */ + ABEF_SAVE = STATE0; + CDGH_SAVE = STATE1; + + /* Load message */ + MSG0 = vld1q_u32((const uint32_t*)(data + 0)); + MSG1 = vld1q_u32((const uint32_t*)(data + 16)); + MSG2 = vld1q_u32((const uint32_t*)(data + 32)); + MSG3 = vld1q_u32((const uint32_t*)(data + 48)); + + /* Reverse for little endian */ + MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0))); + MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1))); + MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2))); + MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3))); + + TMP0 = vaddq_u32(MSG0, vld1q_u32(&sha256_k[0x00])); + + /* Rounds 0-3 */ + MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&sha256_k[0x04])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); + + /* Rounds 4-7 */ + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&sha256_k[0x08])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); + + /* Rounds 8-11 */ + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&sha256_k[0x0c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); + + /* Rounds 12-15 */ + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&sha256_k[0x10])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); + + /* Rounds 16-19 */ + MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&sha256_k[0x14])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); + + /* Rounds 20-23 */ + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&sha256_k[0x18])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); + + /* Rounds 24-27 */ + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&sha256_k[0x1c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); + + /* Rounds 28-31 */ + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&sha256_k[0x20])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); + + /* Rounds 32-35 */ + MSG0 = vsha256su0q_u32(MSG0, MSG1); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&sha256_k[0x24])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3); + + /* Rounds 36-39 */ + MSG1 = vsha256su0q_u32(MSG1, MSG2); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&sha256_k[0x28])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0); + + /* Rounds 40-43 */ + MSG2 = vsha256su0q_u32(MSG2, MSG3); + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&sha256_k[0x2c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1); + + /* Rounds 44-47 */ + MSG3 = vsha256su0q_u32(MSG3, MSG0); + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG0, vld1q_u32(&sha256_k[0x30])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2); + + /* Rounds 48-51 */ + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG1, vld1q_u32(&sha256_k[0x34])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + + /* Rounds 52-55 */ + TMP2 = STATE0; + TMP0 = vaddq_u32(MSG2, vld1q_u32(&sha256_k[0x38])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + + /* Rounds 56-59 */ + TMP2 = STATE0; + TMP1 = vaddq_u32(MSG3, vld1q_u32(&sha256_k[0x3c])); + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0); + + /* Rounds 60-63 */ + TMP2 = STATE0; + STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1); + STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1); + + /* Combine state */ + STATE0 = vaddq_u32(STATE0, ABEF_SAVE); + STATE1 = vaddq_u32(STATE1, CDGH_SAVE); + + /* Save state */ + vst1q_u32(&state[0], STATE0); + vst1q_u32(&state[4], STATE1); + } #endif + } } diff --git a/src/vendor/Soup/soup/Buffer.hpp b/src/vendor/Soup/soup/Buffer.hpp index 03b76e244c..dff97ec6ac 100644 --- a/src/vendor/Soup/soup/Buffer.hpp +++ b/src/vendor/Soup/soup/Buffer.hpp @@ -160,10 +160,18 @@ NAMESPACE_SOUP { const auto s = m_size; grow(count); - memcpy(&m_data[count], &m_data[0], s); + memmove(&m_data[count], &m_data[0], s); memset(&m_data[0], value, count); } + void prepend(const void* src_data, size_t src_size) SOUP_EXCAL + { + const auto s = m_size; + grow(src_size); + memmove(&m_data[src_size], &m_data[0], s); + memcpy(&m_data[0], src_data, src_size); + } + void insert_back(size_t count, uint8_t value) SOUP_EXCAL { const auto s = m_size; diff --git a/src/vendor/Soup/soup/CryptoHashAlgo.hpp b/src/vendor/Soup/soup/CryptoHashAlgo.hpp index e2419f14e6..7e5dfa3568 100644 --- a/src/vendor/Soup/soup/CryptoHashAlgo.hpp +++ b/src/vendor/Soup/soup/CryptoHashAlgo.hpp @@ -2,6 +2,7 @@ #include "base.hpp" +#include // memset, memcpy #include NAMESPACE_SOUP @@ -30,31 +31,12 @@ NAMESPACE_SOUP return true; } - [[nodiscard]] static std::string hmac(const std::string& msg, std::string key) SOUP_EXCAL + [[nodiscard]] static std::string hmac(const std::string& msg, const std::string& key) SOUP_EXCAL { - if (key.length() > T::BLOCK_BYTES) - { - key = T::hash(key); - } - - std::string inner = key; - std::string outer = key; - - for (size_t i = 0; i != key.length(); ++i) - { - inner[i] ^= 0x36; - outer[i] ^= 0x5c; - } - - if (auto diff = T::BLOCK_BYTES - key.length(); diff != 0) - { - inner.append(diff, '\x36'); - outer.append(diff, '\x5c'); - } - - inner.append(msg); - outer.append(T::hash(std::move(inner))); - return T::hash(std::move(outer)); + HmacState st(key); + st.append(msg.data(), msg.size()); + st.finalise(); + return st.getDigest(); } // used as (secret, label, seed) in the RFC @@ -81,5 +63,64 @@ NAMESPACE_SOUP } return res; } + + struct HmacState + { + typename T::State inner; + typename T::State outer; + + HmacState(const std::string& key) noexcept + : HmacState(key.data(), key.size()) + { + } + + HmacState(const void* key_data, size_t key_size) noexcept + { + uint8_t header[T::BLOCK_BYTES]; + memset(header, 0, sizeof(header)); + + if (key_size <= T::BLOCK_BYTES) + { + memcpy(header, key_data, key_size); + } + else + { + typename T::State st; + st.append(key_data, key_size); + st.finalise(); + st.getDigest(header); static_assert(T::DIGEST_BYTES <= T::BLOCK_BYTES); + } + + for (size_t i = 0; i != sizeof(header); ++i) + { + inner.appendByte(header[i] ^ 0x36); + outer.appendByte(header[i] ^ 0x5c); + } + } + + void append(const void* data, size_t size) noexcept + { + inner.append(data, size); + } + + void finalise() noexcept + { + uint8_t buf[T::DIGEST_BYTES]; + inner.finalise(); + inner.getDigest(buf); + outer.append(buf, sizeof(buf)); + outer.finalise(); + } + + void getDigest(uint8_t out[T::DIGEST_BYTES]) const noexcept + { + return outer.getDigest(out); + } + + [[nodiscard]] std::string getDigest() const SOUP_EXCAL + { + return outer.getDigest(); + } + }; }; } diff --git a/src/vendor/Soup/soup/HardwareRng.cpp b/src/vendor/Soup/soup/HardwareRng.cpp index 63e4ece71d..25a5963e9c 100644 --- a/src/vendor/Soup/soup/HardwareRng.cpp +++ b/src/vendor/Soup/soup/HardwareRng.cpp @@ -13,17 +13,17 @@ #include // read, close #endif -#if SOUP_X86 && defined(SOUP_USE_INTRIN) -namespace soup_intrin +NAMESPACE_SOUP { - extern uint16_t hardware_rng_generate16() noexcept; - extern uint32_t hardware_rng_generate32() noexcept; - extern uint64_t hardware_rng_generate64() noexcept; -} +#if SOUP_X86 && defined(SOUP_USE_INTRIN) + namespace intrin + { + extern uint16_t hardware_rng_generate16() noexcept; + extern uint32_t hardware_rng_generate32() noexcept; + extern uint64_t hardware_rng_generate64() noexcept; + } #endif -NAMESPACE_SOUP -{ // HardwareRng bool HardwareRng::isAvailable() noexcept @@ -38,7 +38,7 @@ NAMESPACE_SOUP uint16_t HardwareRng::generate16() noexcept { #if SOUP_X86 && defined(SOUP_USE_INTRIN) - return soup_intrin::hardware_rng_generate16(); + return intrin::hardware_rng_generate16(); #else SOUP_ASSERT_UNREACHABLE; #endif @@ -47,7 +47,7 @@ NAMESPACE_SOUP uint32_t HardwareRng::generate32() noexcept { #if SOUP_X86 && defined(SOUP_USE_INTRIN) - return soup_intrin::hardware_rng_generate32(); + return intrin::hardware_rng_generate32(); #else SOUP_ASSERT_UNREACHABLE; #endif @@ -57,9 +57,9 @@ NAMESPACE_SOUP { #if SOUP_X86 && defined(SOUP_USE_INTRIN) #if SOUP_BITS >= 64 - return soup_intrin::hardware_rng_generate64(); + return intrin::hardware_rng_generate64(); #else - return (static_cast(soup_intrin::hardware_rng_generate32()) << 32) | soup_intrin::hardware_rng_generate32(); + return (static_cast(intrin::hardware_rng_generate32()) << 32) | intrin::hardware_rng_generate32(); #endif #else SOUP_ASSERT_UNREACHABLE; diff --git a/src/vendor/Soup/soup/HttpRequest.cpp b/src/vendor/Soup/soup/HttpRequest.cpp index 8480b502c6..e9c91a18f5 100644 --- a/src/vendor/Soup/soup/HttpRequest.cpp +++ b/src/vendor/Soup/soup/HttpRequest.cpp @@ -21,7 +21,7 @@ NAMESPACE_SOUP HttpRequest::HttpRequest(std::string method, std::string host, std::string path) : MimeMessage({ {ObfusString("Host"), std::move(host)}, - {ObfusString("User-Agent"), ObfusString("Mozilla/5.0 (compatible; Soup Library; +https://soup.do)")}, + {ObfusString("User-Agent"), ObfusString("Mozilla/5.0 (compatible; calamity-inc/Soup)")}, {ObfusString("Connection"), ObfusString("close")}, {ObfusString("Accept-Encoding"), ObfusString("deflate, gzip")}, }), method(std::move(method)), path(std::move(path)) diff --git a/src/vendor/Soup/soup/HttpRequestTask.cpp b/src/vendor/Soup/soup/HttpRequestTask.cpp index 82de952497..c1165164f2 100644 --- a/src/vendor/Soup/soup/HttpRequestTask.cpp +++ b/src/vendor/Soup/soup/HttpRequestTask.cpp @@ -109,8 +109,6 @@ NAMESPACE_SOUP case AWAIT_RESPONSE: if (sock->isWorkDoneOrClosed()) { - sock->close(); - sock.reset(); if (retry_on_broken_pipe) { retry_on_broken_pipe = false; @@ -120,14 +118,25 @@ NAMESPACE_SOUP else { //logWriteLine(soup::format("AWAIT_RESPONSE from {} - request failed", hr.getHost())); + if (sock->custom_data.isStructInMap(SocketCloseReason)) + { + await_response_finish_reason = sock->custom_data.getStructFromMapConst(SocketCloseReason); + } + else + { + await_response_finish_reason = netStatusToString(NET_FAIL_L7_PREMATURE_END); + } setWorkDone(); } + sock->close(); + sock.reset(); } else if (time::unixSecondsSince(awaiting_response_since) > 30) { //logWriteLine(soup::format("AWAIT_RESPONSE from {} - timeout", hr.getHost())); sock->close(); sock.reset(); + await_response_finish_reason = netStatusToString(NET_FAIL_L7_TIMEOUT); setWorkDone(); } break; @@ -155,6 +164,7 @@ NAMESPACE_SOUP { HttpRequest::recvResponse(*sock, [](Socket& s, Optional&& res, Capture&& cap) SOUP_EXCAL { + cap.get()->await_response_finish_reason = netStatusToString(NET_OK); cap.get()->fulfil(std::move(res)); if (s.custom_data.isStructInMap(ReuseTag)) { @@ -193,16 +203,16 @@ NAMESPACE_SOUP return str; } - netStatus HttpRequestTask::getStatus() const noexcept + std::string HttpRequestTask::getStatus() const SOUP_EXCAL { switch (state) { - case CONNECTING: return connector->getStatus(); - case AWAIT_RESPONSE: return isWorkDone() ? (result.has_value() ? NET_OK : NET_FAIL_L7_TIMEOUT) : NET_PENDING; + case CONNECTING: return netStatusToString(connector->getStatus()); + case AWAIT_RESPONSE: return isWorkDone() ? await_response_finish_reason : netStatusToString(NET_PENDING); default: break; // keep the compiler happy } // Assuming `!isWorkDone()` because the task can only finish during CONNECTING and AWAIT_RESPONSE. - return NET_PENDING; + return netStatusToString(NET_PENDING); } #else HttpRequestTask::HttpRequestTask(HttpRequest&& _hr) diff --git a/src/vendor/Soup/soup/HttpRequestTask.hpp b/src/vendor/Soup/soup/HttpRequestTask.hpp index 82dbda8191..9772c739ce 100644 --- a/src/vendor/Soup/soup/HttpRequestTask.hpp +++ b/src/vendor/Soup/soup/HttpRequestTask.hpp @@ -31,6 +31,7 @@ NAMESPACE_SOUP bool dont_use_reusable_sockets = false; bool dont_make_reusable_sockets = false; bool retry_on_broken_pipe = false; // internal + std::string await_response_finish_reason; // internal #endif HttpRequest hr; #if !SOUP_WASM @@ -56,7 +57,7 @@ NAMESPACE_SOUP public: [[nodiscard]] std::string toString() const SOUP_EXCAL final; - [[nodiscard]] netStatus getStatus() const noexcept; + [[nodiscard]] std::string getStatus() const SOUP_EXCAL; #else void onTick() noexcept final; diff --git a/src/vendor/Soup/soup/Reader.cpp b/src/vendor/Soup/soup/Reader.cpp index f80b933fcf..6785523295 100644 --- a/src/vendor/Soup/soup/Reader.cpp +++ b/src/vendor/Soup/soup/Reader.cpp @@ -13,10 +13,11 @@ NAMESPACE_SOUP { return false; } - bool has_next = (b >> 7); has_next &= (bits < 56); - if (has_next) + bool has_next = false; + SOUP_IF_LIKELY (bits < 56) { - b &= 0x7F; + has_next = (b >> 7); + b &= 0x7f; } v |= ((uint64_t)b << bits); if (!has_next) @@ -30,50 +31,27 @@ NAMESPACE_SOUP bool Reader::i64_dyn(int64_t& v) noexcept { - bool neg; - uint8_t b; - SOUP_IF_UNLIKELY (!u8(b)) + uint64_t u; + SOUP_IF_UNLIKELY (!u64_dyn(u)) { return false; } - uint64_t out = (b & 0b111111); - neg = ((b >> 6) & 1); - if ((b >> 7)) - { - uint8_t bits = 6; - while (true) - { - SOUP_IF_UNLIKELY (!u8(b)) - { - return false; - } - bool has_next = (b >> 7); has_next &= (bits < 56); - if (has_next) - { - b &= 0x7F; - } - out |= ((uint64_t)b << bits); - if (!has_next) - { - break; - } - bits += 7; - } - } + const bool neg = (u >> 6) & 1; // check bit 6 + u = ((u >> 1) & ~0x3f) | (u & 0x3f); // remove bit 6 if (neg) { - if (out == 0) + if (u == 0) { v = ((uint64_t)1 << 63); } else { - v = (out * -1); + v = u * -1; } } else { - v = out; + v = u; } return true; } diff --git a/src/vendor/Soup/soup/Reader.hpp b/src/vendor/Soup/soup/Reader.hpp index 0b620b1dd6..6f7785b84e 100644 --- a/src/vendor/Soup/soup/Reader.hpp +++ b/src/vendor/Soup/soup/Reader.hpp @@ -33,9 +33,10 @@ NAMESPACE_SOUP } // An unsigned 64-bit integer encoded in 1..9 bytes. The most significant bit of bytes 1 to 8 is used to indicate if another byte follows. + // Lua implementation: https://gist.github.com/Sainan/02c3ac9cea5015341412c92feec95e56 bool u64_dyn(uint64_t& v) noexcept; - // A signed 64-bit integer encoded in 1..9 bytes. + // A signed 64-bit integer encoded in 1..9 bytes. (Specialisation of u64_dyn.) bool i64_dyn(int64_t& v) noexcept; // An integer where every byte's most significant bit is used to indicate if another byte follows, most significant byte first. diff --git a/src/vendor/Soup/soup/Socket.cpp b/src/vendor/Soup/soup/Socket.cpp index d059f15f8f..8e4c4582e2 100644 --- a/src/vendor/Soup/soup/Socket.cpp +++ b/src/vendor/Soup/soup/Socket.cpp @@ -20,6 +20,7 @@ #include "Exception.hpp" #include "NamedCurves.hpp" #include "netConfig.hpp" +#include "ObfusString.hpp" #include "rand.hpp" #include "sha1.hpp" #include "sha256.hpp" @@ -55,7 +56,7 @@ NAMESPACE_SOUP #endif Socket::Socket() noexcept - : Worker(true) + : Worker(WORKER_TYPE_SOCKET) { onConstruct(); } @@ -1101,13 +1102,13 @@ NAMESPACE_SOUP return tls_encrypter_send.isActive(); } - bool Socket::send(const std::string& data) SOUP_EXCAL + bool Socket::send(const void* data, size_t size) SOUP_EXCAL { if (tls_encrypter_send.isActive()) { - return tls_sendRecordEncrypted(TlsContentType::application_data, data); + return tls_sendRecordEncrypted(TlsContentType::application_data, data, size); } - return transport_send(data); + return transport_send(data, static_cast(size)); } bool Socket::initUdpBroadcast4() @@ -1271,19 +1272,23 @@ NAMESPACE_SOUP bool Socket::tls_sendRecordEncrypted(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL { - auto body = tls_encrypter_send.encrypt(content_type, content); + return tls_sendRecordEncrypted(content_type, content.data(), content.size()); + } + + bool Socket::tls_sendRecordEncrypted(TlsContentType_t content_type, const void* data, size_t size) SOUP_EXCAL + { + auto body = tls_encrypter_send.encrypt(content_type, data, size); TlsRecord record{}; record.content_type = content_type; record.length = static_cast(body.size()); - Buffer buf(5 + body.size()); - BufferRefWriter bw(buf, ENDIAN_BIG); + Buffer header(5); + BufferRefWriter bw(header, ENDIAN_BIG); record.write(bw); - buf.append(body.data(), body.size()); - - return transport_send(buf); + body.prepend(header.data(), header.size()); + return transport_send(body); } struct CaptureSocketTlsRecvHandshake @@ -1291,7 +1296,6 @@ NAMESPACE_SOUP UniquePtr handshaker; void(*callback)(Socket&, UniquePtr&&, TlsHandshakeType_t, std::string&&) SOUP_EXCAL; std::string pre; - bool is_new_bytes = false; }; void Socket::tls_recvHandshake(UniquePtr&& handshaker, void(*callback)(Socket&, UniquePtr&&, TlsHandshakeType_t, std::string&&) SOUP_EXCAL, std::string&& pre) SOUP_EXCAL @@ -1306,9 +1310,10 @@ NAMESPACE_SOUP { if (content_type != TlsContentType::handshake) { -#if LOGGING if (content_type == TlsContentType::alert) { + s.custom_data.getStructFromMap(SocketCloseReason) = tls_alertToCloseReason(data); +#if LOGGING std::string msg = s.toString(); msg.append(" - Remote closing connection with "); if (data.at(0) == 2) @@ -1319,25 +1324,23 @@ NAMESPACE_SOUP msg.append(std::to_string((int)data.at(1))); msg.append(". See TlsAlertDescription for details."); logWriteLine(std::move(msg)); +#endif } else { - std::string msg = "Unexpected content type; expected handshake, found "; + std::string msg = ObfusString("Unexpected content type during handshake: ").str(); msg.append(std::to_string((int)content_type)); - logWriteLine(std::move(msg)); - } +#if LOGGING + logWriteLine(msg); #endif + s.custom_data.getStructFromMap(SocketCloseReason) = std::move(msg); + } s.tls_close(TlsAlertDescription::unexpected_message); return; } auto& cap = _cap.get(); - if (cap.is_new_bytes) - { - cap.handshaker->layer_bytes.append(data); - } - if (!cap.pre.empty()) { data.insert(0, cap.pre); @@ -1356,26 +1359,26 @@ NAMESPACE_SOUP return; } + cap.handshaker->layer_bytes.append(data.substr(0, 4)); data.erase(0, 4); if (data.size() > hs.length) { - s.tls_record_buf = data.substr(hs.length); + s.transport_unrecv(data.substr(hs.length)); + + TlsRecord record{}; + record.content_type = TlsContentType::handshake; + record.length = static_cast(data.size() - hs.length); + s.transport_unrecv(record.toBinaryString()); + data.erase(hs.length); } + cap.handshaker->layer_bytes.append(data); + cap.callback(s, std::move(cap.handshaker), hs.handshake_type, std::move(data)); }; - if (!tls_record_buf.empty()) - { - std::string data = std::move(tls_record_buf); - tls_record_buf.clear(); - record_callback(*this, TlsContentType::handshake, std::move(data), std::move(cap)); - return; - } - - cap.is_new_bytes = true; tls_recvRecord(record_callback, std::move(cap)); } @@ -1397,6 +1400,7 @@ NAMESPACE_SOUP } else if (content_type == TlsContentType::alert) { + s.custom_data.getStructFromMap(SocketCloseReason) = tls_alertToCloseReason(data); #if LOGGING { std::string msg = s.toString(); @@ -1443,13 +1447,6 @@ NAMESPACE_SOUP void Socket::tls_recvRecord(void(*callback)(Socket&, TlsContentType_t, std::string&&, Capture&&), Capture&& cap) { - if (!tls_record_buf.empty()) - { - std::string data = std::move(tls_record_buf); - tls_record_buf.clear(); - callback(*this, TlsContentType::handshake, std::move(data), std::move(cap)); - return; - } transport_recvExact(5, [](Socket& s, std::string&& data, Capture&& cap) SOUP_EXCAL { TlsRecord record{}; @@ -1543,7 +1540,7 @@ NAMESPACE_SOUP iv.insert(iv.end(), nonce_explicit.begin(), nonce_explicit.end()); data.erase(0, record_iv_length); - auto ad = s.tls_encrypter_recv.calculateMacBytes(cap.content_type, data); + auto ad = s.tls_encrypter_recv.calculateMacBytes(cap.content_type, data.size()); if (aes::gcmDecrypt( (uint8_t*)data.data(), data.size(), @@ -1586,6 +1583,18 @@ NAMESPACE_SOUP } } + std::string Socket::tls_alertToCloseReason(const std::string& data) + { + std::string msg = ObfusString("Remote closing connection with ").str(); + if (data.at(0) == 2) + { + msg.append(ObfusString("fatal ").str()); + } + msg.append(ObfusString("alert: ").str()); + msg.append(std::to_string((int)data.at(1))); + return msg; + } + bool Socket::transport_hasData() const { char buf; @@ -1609,6 +1618,12 @@ NAMESPACE_SOUP std::string Socket::transport_recvCommon(int max_bytes) SOUP_EXCAL { + if (!unrecv_buf.empty()) + { + std::string ret = unrecv_buf.substr(0, max_bytes); + unrecv_buf.erase(0, max_bytes); + return ret; + } std::string buf(max_bytes, '\0'); auto res = ::recv(fd, buf.data(), max_bytes, 0); if (res > 0) @@ -1698,6 +1713,11 @@ NAMESPACE_SOUP }, CaptureSocketTransportRecvExact(bytes, callback, std::move(cap), std::move(pre))); } + void Socket::transport_unrecv(const std::string& data) SOUP_EXCAL + { + unrecv_buf.insert(0, data); + } + void Socket::transport_close() noexcept { if (hasConnection()) diff --git a/src/vendor/Soup/soup/Socket.hpp b/src/vendor/Soup/soup/Socket.hpp index 7bea5a11c2..5a4e00f5a5 100644 --- a/src/vendor/Soup/soup/Socket.hpp +++ b/src/vendor/Soup/soup/Socket.hpp @@ -43,7 +43,7 @@ NAMESPACE_SOUP bool dispatched_connection_lost = false; bool callback_recv_on_close = false; - std::string tls_record_buf{}; + std::string unrecv_buf{}; SocketTlsEncrypter tls_encrypter_send; SocketTlsEncrypter tls_encrypter_recv; @@ -126,7 +126,8 @@ NAMESPACE_SOUP [[nodiscard]] bool isEncrypted() const noexcept; - bool send(const std::string& data) SOUP_EXCAL; + bool send(const std::string& data) SOUP_EXCAL { return send(data.data(), data.size()); } + bool send(const void* data, size_t size) SOUP_EXCAL; bool initUdpBroadcast4(); @@ -166,6 +167,7 @@ NAMESPACE_SOUP bool tls_sendHandshake(const UniquePtr& handshaker, TlsHandshakeType_t handshake_type, const std::string& content) SOUP_EXCAL; bool tls_sendRecord(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL; bool tls_sendRecordEncrypted(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL; + bool tls_sendRecordEncrypted(TlsContentType_t content_type, const void* data, size_t size) SOUP_EXCAL; void tls_recvHandshake(UniquePtr&& handshaker, void(*callback)(Socket&, UniquePtr&&, TlsHandshakeType_t, std::string&&) SOUP_EXCAL, std::string&& pre = {}) SOUP_EXCAL; void tls_recvRecord(TlsContentType_t expected_content_type, void(*callback)(Socket&, std::string&&, Capture&&), Capture&& cap = {}); // 'excal' as long as callback is @@ -173,6 +175,8 @@ NAMESPACE_SOUP void tls_close(TlsAlertDescription_t desc) SOUP_EXCAL; + [[nodiscard]] static std::string tls_alertToCloseReason(const std::string& data); + // Transport Layer bool transport_send(const Buffer& buf) const noexcept; @@ -189,6 +193,8 @@ NAMESPACE_SOUP void transport_recv(int max_bytes, transport_recv_callback_t callback, Capture&& cap = {}); // 'excal' as long as callback is void transport_recvExact(int bytes, transport_recv_callback_t callback, Capture&& cap = {}, std::string&& pre = {}); // 'excal' as long as callback is + void transport_unrecv(const std::string& data) SOUP_EXCAL; + void transport_close() noexcept; // Utils diff --git a/src/vendor/Soup/soup/SocketTlsEncrypter.cpp b/src/vendor/Soup/soup/SocketTlsEncrypter.cpp index e74dfa3ee1..96ec0950a7 100644 --- a/src/vendor/Soup/soup/SocketTlsEncrypter.cpp +++ b/src/vendor/Soup/soup/SocketTlsEncrypter.cpp @@ -13,31 +13,37 @@ NAMESPACE_SOUP return mac_key.size(); } - std::string SocketTlsEncrypter::calculateMacBytes(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL + std::string SocketTlsEncrypter::calculateMacBytes(TlsContentType_t content_type, size_t content_length) SOUP_EXCAL { TlsMac mac{}; mac.seq_num = seq_num++; mac.record.content_type = content_type; - mac.record.length = static_cast(content.size()); + mac.record.length = static_cast(content_length); return mac.toBinaryString(); } - std::string SocketTlsEncrypter::calculateMac(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL + std::string SocketTlsEncrypter::calculateMac(TlsContentType_t content_type, const void* data, size_t size) SOUP_EXCAL { - auto msg = calculateMacBytes(content_type, content); - msg.append(content); - + auto msg = calculateMacBytes(content_type, size); if (mac_key.size() == 20) { - return sha1::hmac(msg, mac_key); + sha1::HmacState st(mac_key); + st.append(msg.data(), msg.size()); + st.append(data, size); + st.finalise(); + return st.getDigest(); } //else if (mac_key.size() == 32) { - return sha256::hmac(msg, mac_key); + sha256::HmacState st(mac_key); + st.append(msg.data(), msg.size()); + st.append(data, size); + st.finalise(); + return st.getDigest(); } } - std::vector SocketTlsEncrypter::encrypt(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL + Buffer SocketTlsEncrypter::encrypt(TlsContentType_t content_type, const void* data, size_t size) SOUP_EXCAL { constexpr auto cipher_bytes = 16; @@ -45,26 +51,25 @@ NAMESPACE_SOUP { constexpr auto record_iv_length = 16; - auto mac = calculateMac(content_type, content); - auto cont_with_mac_size = (content.size() + mac.size()); + auto mac = calculateMac(content_type, data, size); + auto cont_with_mac_size = (size + mac.size()); auto aligned_in_len = ((((cont_with_mac_size + 1) / cipher_bytes) + 1) * cipher_bytes); auto pad_len = static_cast(aligned_in_len - cont_with_mac_size); - std::vector data{}; - data.reserve(content.size() + mac.size() + pad_len); - data.insert(data.end(), content.begin(), content.end()); - data.insert(data.end(), mac.begin(), mac.end()); - data.insert(data.end(), (size_t)pad_len, (pad_len - 1)); + Buffer buf(size + mac.size() + pad_len); + buf.append(data, size); + buf.append(mac.data(), mac.size()); + buf.insert_back((size_t)pad_len, (pad_len - 1)); auto iv = rand.vec_u8(record_iv_length); aes::cbcEncrypt( - data.data(), data.size(), + buf.data(), buf.size(), cipher_key.data(), cipher_key.size(), iv.data() ); - iv.insert(iv.end(), data.begin(), data.end()); - return iv; + buf.prepend(iv.data(), iv.size()); + return buf; } else // AES-GCM { @@ -74,22 +79,23 @@ NAMESPACE_SOUP auto iv = implicit_iv; iv.insert(iv.end(), nonce_explicit.begin(), nonce_explicit.end()); - auto ad = calculateMacBytes(content_type, content); + auto ad = calculateMacBytes(content_type, size); - std::vector data(content.begin(), content.end()); + Buffer buf(size + cipher_bytes + nonce_explicit.size()); + buf.append(data, size); uint8_t tag[cipher_bytes]; aes::gcmEncrypt( - data.data(), data.size(), + buf.data(), buf.size(), (const uint8_t*)ad.data(), ad.size(), cipher_key.data(), cipher_key.size(), iv.data(), iv.size(), tag ); - data.insert(data.end(), tag, tag + cipher_bytes); - data.insert(data.begin(), nonce_explicit.begin(), nonce_explicit.end()); - return data; + buf.append(tag, cipher_bytes); + buf.prepend(nonce_explicit.data(), nonce_explicit.size()); + return buf; } } diff --git a/src/vendor/Soup/soup/SocketTlsEncrypter.hpp b/src/vendor/Soup/soup/SocketTlsEncrypter.hpp index 28bba1a38f..9549b25379 100644 --- a/src/vendor/Soup/soup/SocketTlsEncrypter.hpp +++ b/src/vendor/Soup/soup/SocketTlsEncrypter.hpp @@ -6,6 +6,8 @@ #include "base.hpp" #include "type.hpp" +#include "Buffer.hpp" + NAMESPACE_SOUP { struct SocketTlsEncrypter @@ -26,10 +28,11 @@ NAMESPACE_SOUP } [[nodiscard]] size_t getMacLength() const noexcept; - [[nodiscard]] std::string calculateMacBytes(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL; - [[nodiscard]] std::string calculateMac(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL; + [[nodiscard]] std::string calculateMacBytes(TlsContentType_t content_type, size_t content_length) SOUP_EXCAL; + [[nodiscard]] std::string calculateMac(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL { return calculateMac(content_type, content.data(), content.size()); } + [[nodiscard]] std::string calculateMac(TlsContentType_t content_type, const void* data, size_t size) SOUP_EXCAL; - [[nodiscard]] std::vector encrypt(TlsContentType_t content_type, const std::string& content) SOUP_EXCAL; + [[nodiscard]] Buffer encrypt(TlsContentType_t content_type, const void* data, size_t size) SOUP_EXCAL; void reset() noexcept; }; diff --git a/src/vendor/Soup/soup/Uri.cpp b/src/vendor/Soup/soup/Uri.cpp index 5419f61edd..85fff0df34 100644 --- a/src/vendor/Soup/soup/Uri.cpp +++ b/src/vendor/Soup/soup/Uri.cpp @@ -31,51 +31,51 @@ NAMESPACE_SOUP if (uri.length() > 2 && uri.substr(0, 2) == "//") { uri.erase(0, 2); + } - size_t authority_ends = uri.find('/'); + size_t authority_ends = uri.find('/'); + if (authority_ends == std::string::npos) + { + authority_ends = uri.find('?'); if (authority_ends == std::string::npos) { - authority_ends = uri.find('?'); - if (authority_ends == std::string::npos) - { - authority_ends = uri.find('#'); - } - } - - auto userinfo_sep = uri.find('@'); - if (userinfo_sep < authority_ends) - { - auto pass_sep = uri.find(':'); - if (pass_sep < authority_ends) - { - user = uri.substr(0, pass_sep); - pass = uri.substr((pass_sep + 1), userinfo_sep - (pass_sep + 1)); - } - else - { - user = uri.substr(0, userinfo_sep); - } - - uri.erase(0, userinfo_sep + 1); + authority_ends = uri.find('#'); } + } - auto port_sep = uri.find(':'); - if (port_sep < authority_ends) + auto userinfo_sep = uri.find('@'); + if (userinfo_sep < authority_ends) + { + auto pass_sep = uri.find(':'); + if (pass_sep < authority_ends) { - host = uri.substr(0, port_sep); - const char* pPort = &uri.at(port_sep + 1); - const char* i = pPort; - port = string::toIntImpl(i); - ++i; - - uri.erase(0, port_sep + (i - pPort)); + user = uri.substr(0, pass_sep); + pass = uri.substr((pass_sep + 1), userinfo_sep - (pass_sep + 1)); } else { - host = uri.substr(0, authority_ends); - - uri.erase(0, authority_ends); + user = uri.substr(0, userinfo_sep); } + + uri.erase(0, userinfo_sep + 1); + } + + auto port_sep = uri.find(':'); + if (port_sep < authority_ends) + { + host = uri.substr(0, port_sep); + const char* pPort = &uri.at(port_sep + 1); + const char* i = pPort; + port = string::toIntImpl(i); + ++i; + + uri.erase(0, port_sep + (i - pPort)); + } + else + { + host = uri.substr(0, authority_ends); + + uri.erase(0, authority_ends); } auto query_sep = uri.find('?'); @@ -114,12 +114,7 @@ NAMESPACE_SOUP { path = std::move(uri); } - query = urlenc::decode(query); } - - path = urlenc::decode(path); - query = urlenc::decode(query); - fragment = urlenc::decode(fragment); } std::string Uri::toString() const SOUP_EXCAL @@ -161,11 +156,11 @@ NAMESPACE_SOUP std::string Uri::getRequestPath() const SOUP_EXCAL { - auto str = urlenc::encodePath(path); + auto str = path; if (!query.empty()) { str.push_back('?'); - str.append(urlenc::encodePathWithQuery(query)); + str.append(query); } return str; } diff --git a/src/vendor/Soup/soup/Writer.cpp b/src/vendor/Soup/soup/Writer.cpp index 63124db030..1dcaeca12f 100644 --- a/src/vendor/Soup/soup/Writer.cpp +++ b/src/vendor/Soup/soup/Writer.cpp @@ -7,7 +7,7 @@ NAMESPACE_SOUP uint64_t in = v; for (uint8_t i = 0; i != 8; ++i) { - uint8_t cur = (in & 0x7F); + uint8_t cur = (in & 0x7f); in >>= 7; if (in != 0) { @@ -29,58 +29,17 @@ NAMESPACE_SOUP bool Writer::i64_dyn(const int64_t& v) noexcept { - // Split value - uint64_t in; + uint64_t u; bool neg = (v < 0); if (neg) { - in = (v * -1); - in &= ~((uint64_t)1 << 63); + u = (v * -1) & ~((uint64_t)1 << 63); } else { - in = v; + u = v; } - - // First byte - { - uint8_t cur = (in & 0b111111); - cur |= (neg << 6); - in >>= 6; - if (in != 0) - { - cur |= 0x80; - u8(cur); - } - else - { - return u8(cur); - } - } - - // Next 1..7 bytes - for (uint8_t i = 0; i != 7; ++i) - { - uint8_t cur = (in & 0x7F); - in >>= 7; - if (in != 0) - { - cur |= 0x80; - u8(cur); - } - else - { - return u8(cur); - } - } - - // Optional last byte - if (in != 0) - { - auto byte = (uint8_t)in; - return u8(byte); - } - return true; + return u64_dyn(((uint64_t)neg << 6) | ((u & ~0x3f) << 1) | (u & 0x3f)); } bool Writer::mysql_lenenc(const uint64_t& v) noexcept diff --git a/src/vendor/Soup/soup/Writer.hpp b/src/vendor/Soup/soup/Writer.hpp index 3012761088..830e3a80f5 100644 --- a/src/vendor/Soup/soup/Writer.hpp +++ b/src/vendor/Soup/soup/Writer.hpp @@ -22,9 +22,10 @@ NAMESPACE_SOUP } // An unsigned 64-bit integer encoded in 1..9 bytes. The most significant bit of bytes 1 to 8 is used to indicate if another byte follows. + // Lua implementation: https://gist.github.com/Sainan/02c3ac9cea5015341412c92feec95e56 bool u64_dyn(const uint64_t& v) noexcept; - // A signed 64-bit integer encoded in 1..9 bytes. + // A signed 64-bit integer encoded in 1..9 bytes. (Specialisation of u64_dyn.) bool i64_dyn(const int64_t& v) noexcept; // An integer where every byte's most significant bit is used to indicate if another byte follows, most significant byte first. diff --git a/src/vendor/Soup/soup/aes.cpp b/src/vendor/Soup/soup/aes.cpp index c92684ef6c..c29fb8a144 100644 --- a/src/vendor/Soup/soup/aes.cpp +++ b/src/vendor/Soup/soup/aes.cpp @@ -42,24 +42,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#if AES_USE_INTRIN -namespace soup_intrin -{ - extern void aes_expand_key_128(uint8_t w[176], const uint8_t key[16]) noexcept; - extern void aes_expand_key_192(uint8_t w[208], const uint8_t key[24]) noexcept; - extern void aes_expand_key_256(uint8_t w[240], const uint8_t key[32]) noexcept; - extern void aes_encrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept; - extern void aes_encrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept; - extern void aes_encrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept; - extern void aes_prepare_decryption_128(uint8_t w[176]) noexcept; - extern void aes_prepare_decryption_192(uint8_t w[208]) noexcept; - extern void aes_prepare_decryption_256(uint8_t w[240]) noexcept; - extern void aes_decrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept; - extern void aes_decrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept; - extern void aes_decrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept; -} -#endif - #if SOUP_X86 #define IS_AES_INTRIN_AVAILBLE CpuInfo::get().supportsAESNI() #else @@ -68,6 +50,24 @@ namespace soup_intrin NAMESPACE_SOUP { +#if AES_USE_INTRIN + namespace intrin + { + extern void aes_expand_key_128(uint8_t w[176], const uint8_t key[16]) noexcept; + extern void aes_expand_key_192(uint8_t w[208], const uint8_t key[24]) noexcept; + extern void aes_expand_key_256(uint8_t w[240], const uint8_t key[32]) noexcept; + extern void aes_encrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept; + extern void aes_encrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept; + extern void aes_encrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept; + extern void aes_prepare_decryption_128(uint8_t w[176]) noexcept; + extern void aes_prepare_decryption_192(uint8_t w[208]) noexcept; + extern void aes_prepare_decryption_256(uint8_t w[240]) noexcept; + extern void aes_decrypt_block_128(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[176]) noexcept; + extern void aes_decrypt_block_192(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[208]) noexcept; + extern void aes_decrypt_block_256(const uint8_t in[16], uint8_t out[16], const uint8_t roundKeys[240]) noexcept; + } +#endif + static constexpr int Nb = 4; static constexpr unsigned int blockBytesLen = 4 * Nb * sizeof(uint8_t); @@ -365,28 +365,28 @@ NAMESPACE_SOUP { if (key_len == 16) { - soup_intrin::aes_expand_key_128(roundKeys, key); + intrin::aes_expand_key_128(roundKeys, key); for (size_t i = 0; i != data_len; i += blockBytesLen) { - soup_intrin::aes_encrypt_block_128(&data[i], &data[i], roundKeys); + intrin::aes_encrypt_block_128(&data[i], &data[i], roundKeys); } return; } else if (key_len == 24) { - soup_intrin::aes_expand_key_192(roundKeys, key); + intrin::aes_expand_key_192(roundKeys, key); for (size_t i = 0; i != data_len; i += blockBytesLen) { - soup_intrin::aes_encrypt_block_192(&data[i], &data[i], roundKeys); + intrin::aes_encrypt_block_192(&data[i], &data[i], roundKeys); } return; } else if (key_len == 32) { - soup_intrin::aes_expand_key_256(roundKeys, key); + intrin::aes_expand_key_256(roundKeys, key); for (size_t i = 0; i != data_len; i += blockBytesLen) { - soup_intrin::aes_encrypt_block_256(&data[i], &data[i], roundKeys); + intrin::aes_encrypt_block_256(&data[i], &data[i], roundKeys); } return; } @@ -412,31 +412,31 @@ NAMESPACE_SOUP { if (key_len == 16) { - soup_intrin::aes_expand_key_128(roundKeys, key); - soup_intrin::aes_prepare_decryption_128(roundKeys); + intrin::aes_expand_key_128(roundKeys, key); + intrin::aes_prepare_decryption_128(roundKeys); for (size_t i = 0; i != data_len; i += blockBytesLen) { - soup_intrin::aes_decrypt_block_128(&data[i], &data[i], roundKeys); + intrin::aes_decrypt_block_128(&data[i], &data[i], roundKeys); } return; } else if (key_len == 24) { - soup_intrin::aes_expand_key_192(roundKeys, key); - soup_intrin::aes_prepare_decryption_192(roundKeys); + intrin::aes_expand_key_192(roundKeys, key); + intrin::aes_prepare_decryption_192(roundKeys); for (size_t i = 0; i != data_len; i += blockBytesLen) { - soup_intrin::aes_decrypt_block_192(&data[i], &data[i], roundKeys); + intrin::aes_decrypt_block_192(&data[i], &data[i], roundKeys); } return; } else if (key_len == 32) { - soup_intrin::aes_expand_key_256(roundKeys, key); - soup_intrin::aes_prepare_decryption_256(roundKeys); + intrin::aes_expand_key_256(roundKeys, key); + intrin::aes_prepare_decryption_256(roundKeys); for (size_t i = 0; i != data_len; i += blockBytesLen) { - soup_intrin::aes_decrypt_block_256(&data[i], &data[i], roundKeys); + intrin::aes_decrypt_block_256(&data[i], &data[i], roundKeys); } return; } @@ -513,15 +513,15 @@ NAMESPACE_SOUP { if (Nr == 10) { - return soup_intrin::aes_encrypt_block_128(in, out, roundKeys); + return intrin::aes_encrypt_block_128(in, out, roundKeys); } else if (Nr == 12) { - return soup_intrin::aes_encrypt_block_192(in, out, roundKeys); + return intrin::aes_encrypt_block_192(in, out, roundKeys); } else if (Nr == 14) { - return soup_intrin::aes_encrypt_block_256(in, out, roundKeys); + return intrin::aes_encrypt_block_256(in, out, roundKeys); } } #endif @@ -574,15 +574,15 @@ NAMESPACE_SOUP { if (Nr == 10) { - return soup_intrin::aes_decrypt_block_128(in, out, roundKeys); + return intrin::aes_decrypt_block_128(in, out, roundKeys); } else if (Nr == 12) { - return soup_intrin::aes_decrypt_block_192(in, out, roundKeys); + return intrin::aes_decrypt_block_192(in, out, roundKeys); } else if (Nr == 14) { - return soup_intrin::aes_decrypt_block_256(in, out, roundKeys); + return intrin::aes_decrypt_block_256(in, out, roundKeys); } } #endif @@ -634,15 +634,15 @@ NAMESPACE_SOUP { if (key_len == 16) { - return soup_intrin::aes_expand_key_128(w, key); + return intrin::aes_expand_key_128(w, key); } else if (key_len == 24) { - return soup_intrin::aes_expand_key_192(w, key); + return intrin::aes_expand_key_192(w, key); } else if (key_len == 32) { - return soup_intrin::aes_expand_key_256(w, key); + return intrin::aes_expand_key_256(w, key); } } #endif @@ -700,15 +700,15 @@ NAMESPACE_SOUP { if (key_len == 16) { - soup_intrin::aes_prepare_decryption_128(w); + intrin::aes_prepare_decryption_128(w); } else if (key_len == 24) { - return soup_intrin::aes_prepare_decryption_192(w); + return intrin::aes_prepare_decryption_192(w); } else if (key_len == 32) { - return soup_intrin::aes_prepare_decryption_256(w); + return intrin::aes_prepare_decryption_256(w); } } #endif diff --git a/src/vendor/Soup/soup/crc32.cpp b/src/vendor/Soup/soup/crc32.cpp index 4538be1079..074fa91c7e 100644 --- a/src/vendor/Soup/soup/crc32.cpp +++ b/src/vendor/Soup/soup/crc32.cpp @@ -17,16 +17,16 @@ // Original source: https://github.com/richgel999/fpng/blob/main/src/fpng.cpp // Original licence: Dedicated to the public domain. -#if CRC32_USE_INTRIN -namespace soup_intrin +NAMESPACE_SOUP { - extern uint32_t crc32_pclmul(const uint8_t* p, size_t size, uint32_t crc) noexcept; - extern uint32_t crc32_armv8(const uint8_t* p, size_t size, uint32_t crc) noexcept; -} +#if CRC32_USE_INTRIN + namespace intrin + { + extern uint32_t crc32_pclmul(const uint8_t* p, size_t size, uint32_t crc) noexcept; + extern uint32_t crc32_armv8(const uint8_t* p, size_t size, uint32_t crc) noexcept; + } #endif -NAMESPACE_SOUP -{ static const uint32_t crc32_lookup4[4][256] = { {00, 016701630226, 035603460454, 023102250672, 0733342031, 016032572217, 035130722465, 023631112643, 01666704062, 017167134244, 034065364436, 022764554610, 01155446053, 017654276275, 034756026407, 022057616621, 03555610144, 015254020362, 036356270510, 020457440736, 03266552175, 015567362353, 036465132521, 020364702707, 02333114126, 014432724300, 037530574572, 021231344754, 02400256117, 014301466331, 037203636543, 021502006765, 07333420310, 011432210136, 032530040744, 024231670562, 07400762321, 011301152107, 032203302775, 024502532553, 06555324372, 010254514154, 033356744726, 025457174500, 06266066343, 010567656165, 033465406717, 025364236531, 04666230254, 012167400072, 031065650600, 027764060426, 04155172265, 012654742043, 031756512631, 027057322417, 05000534236, 013701304010, 030603154662, 026102764444, 05733676207, 013032046021, 030130216653, 026631426475, @@ -112,7 +112,7 @@ NAMESPACE_SOUP } uint32_t simd_len = size & ~15; - uint32_t c = soup_intrin::crc32_pclmul(data, simd_len, init); + uint32_t c = intrin::crc32_pclmul(data, simd_len, init); return crc32_slice_by_4(data + simd_len, size - simd_len, c); } #endif @@ -131,7 +131,7 @@ NAMESPACE_SOUP #elif SOUP_ARM if (CpuInfo::get().armv8_crc32) { - return soup_intrin::crc32_armv8(data, size, init); + return intrin::crc32_armv8(data, size, init); } #endif #endif diff --git a/src/vendor/Soup/soup/deflate.cpp b/src/vendor/Soup/soup/deflate.cpp index bb310d3f11..61c3e1180a 100644 --- a/src/vendor/Soup/soup/deflate.cpp +++ b/src/vendor/Soup/soup/deflate.cpp @@ -505,7 +505,7 @@ NAMESPACE_SOUP }; - unsigned int copyStored(DeflateBitReader& bit_reader, unsigned char* out, size_t out_offset, uint16_t block_size_max) + unsigned int copyStored(DeflateBitReader& bit_reader, unsigned char* out, size_t out_offset, size_t block_size_max) { SOUP_IF_UNLIKELY (!bit_reader.alignToByte()) { @@ -531,7 +531,6 @@ NAMESPACE_SOUP SOUP_IF_UNLIKELY (stored_length > block_size_max) { return -1; - //stored_length = block_size_max; } memcpy(out + out_offset, bit_reader.getInBlock(), stored_length); @@ -900,7 +899,7 @@ NAMESPACE_SOUP DeflateBitReader br(current_compressed_data, end_compressed_data); res.decompressed = std::string(max_decompressed_size, '\0'); - auto out = reinterpret_cast(&res.decompressed.at(0)); + auto out = reinterpret_cast(&res.decompressed[0]); size_t current_out_offset = 0; while (true) { @@ -911,7 +910,7 @@ NAMESPACE_SOUP switch (block_type) { case 0: - block_result = copyStored(br, out, current_out_offset, static_cast(max_decompressed_size - current_out_offset)); + block_result = copyStored(br, out, current_out_offset, max_decompressed_size - current_out_offset); break; case 1: @@ -998,6 +997,8 @@ NAMESPACE_SOUP break; } + res.compressed_size = (current_compressed_data - (unsigned char*)compressed_data); + return res; } } diff --git a/src/vendor/Soup/soup/deflate.hpp b/src/vendor/Soup/soup/deflate.hpp index d3068ec954..23a662f171 100644 --- a/src/vendor/Soup/soup/deflate.hpp +++ b/src/vendor/Soup/soup/deflate.hpp @@ -12,6 +12,7 @@ NAMESPACE_SOUP struct DecompressResult { std::string decompressed{}; + size_t compressed_size = 0; bool checksum_present = false; bool checksum_mismatch = false; }; diff --git a/src/vendor/Soup/soup/filesystem.cpp b/src/vendor/Soup/soup/filesystem.cpp index f8abcaf44a..aec877a20c 100644 --- a/src/vendor/Soup/soup/filesystem.cpp +++ b/src/vendor/Soup/soup/filesystem.cpp @@ -47,6 +47,15 @@ NAMESPACE_SOUP return static_cast(in.tellg()); } + bool filesystem::replace(const std::filesystem::path& replaced, const std::filesystem::path& replacement) + { +#if SOUP_WINDOWS + return ReplaceFileW(replaced.c_str(), replacement.c_str(), nullptr, 0, 0, 0) != 0; +#else + return rename(replacement.c_str(), replaced.c_str()) == 0; +#endif + } + std::filesystem::path filesystem::tempfile(const std::string& ext) { std::filesystem::path path; diff --git a/src/vendor/Soup/soup/filesystem.hpp b/src/vendor/Soup/soup/filesystem.hpp index 9716e3a894..f9f50e94d0 100644 --- a/src/vendor/Soup/soup/filesystem.hpp +++ b/src/vendor/Soup/soup/filesystem.hpp @@ -14,6 +14,8 @@ NAMESPACE_SOUP [[nodiscard]] static bool exists_case_sensitive(const std::filesystem::path& p); [[nodiscard]] static intptr_t filesize(const std::filesystem::path& path); // returns -1 on error + static bool replace(const std::filesystem::path& replaced, const std::filesystem::path& replacement); + [[nodiscard]] static std::filesystem::path tempfile(const std::string& ext = {}); [[nodiscard]] static std::filesystem::path getProgramData() noexcept; diff --git a/src/vendor/Soup/soup/fwd.hpp b/src/vendor/Soup/soup/fwd.hpp index f29136a75f..f93c25bfa6 100644 --- a/src/vendor/Soup/soup/fwd.hpp +++ b/src/vendor/Soup/soup/fwd.hpp @@ -172,14 +172,12 @@ NAMESPACE_SOUP // os enum ControlInput : uint8_t; + struct HandleRaii; class Module; enum MouseButton : uint8_t; class Thread; struct Window; - // os.windows - struct HandleRaii; - // task class Capture; class DetachedScheduler; diff --git a/src/vendor/Soup/soup/netStatus.cpp b/src/vendor/Soup/soup/netStatus.cpp index c6878af910..96071a9024 100644 --- a/src/vendor/Soup/soup/netStatus.cpp +++ b/src/vendor/Soup/soup/netStatus.cpp @@ -4,19 +4,17 @@ NAMESPACE_SOUP { const char* netStatusToString(netStatus status) { - if (status != NET_PENDING) + switch (status) { - switch (status) - { - case NET_PENDING: break; // keep the compiler happy - case NET_OK: return "OK"; - case NET_FAIL_NO_DNS_RESPONSE: return "DNS Query Got No Response"; - case NET_FAIL_NO_DNS_RESULTS: return "DNS Query Yielded No Results"; - case NET_FAIL_L4_TIMEOUT: return "TCP Handshake Timed Out"; - case NET_FAIL_L4_ERROR: return "TCP Handshake Failed"; - case NET_FAIL_L7_TIMEOUT: return "Request Timed Out"; - } + case NET_PENDING: return "Pending"; + case NET_OK: return "OK"; + case NET_FAIL_NO_DNS_RESPONSE: return "DNS Query Got No Response"; + case NET_FAIL_NO_DNS_RESULTS: return "DNS Query Yielded No Results"; + case NET_FAIL_L4_TIMEOUT: return "TCP Handshake Timed Out"; + case NET_FAIL_L4_ERROR: return "TCP Handshake Failed"; + case NET_FAIL_L7_TIMEOUT: return "Request Timed Out"; + case NET_FAIL_L7_PREMATURE_END: return "Connection Closed Prematurely"; } - return "Pending"; + SOUP_UNREACHABLE; } } diff --git a/src/vendor/Soup/soup/netStatus.hpp b/src/vendor/Soup/soup/netStatus.hpp index c1b61c6280..5f2b1adf62 100644 --- a/src/vendor/Soup/soup/netStatus.hpp +++ b/src/vendor/Soup/soup/netStatus.hpp @@ -15,6 +15,7 @@ NAMESPACE_SOUP NET_FAIL_L4_TIMEOUT, NET_FAIL_L4_ERROR, NET_FAIL_L7_TIMEOUT, + NET_FAIL_L7_PREMATURE_END, }; [[nodiscard]] const char* netStatusToString(netStatus status); diff --git a/src/vendor/Soup/soup/os.cpp b/src/vendor/Soup/soup/os.cpp index 945cc567aa..803c4199ba 100644 --- a/src/vendor/Soup/soup/os.cpp +++ b/src/vendor/Soup/soup/os.cpp @@ -134,11 +134,10 @@ NAMESPACE_SOUP #endif } +#if !SOUP_WINDOWS void os::sleep(unsigned int ms) noexcept { -#ifdef _WIN32 - ::Sleep(ms); -#elif _POSIX_C_SOURCE >= 199309L +#if _POSIX_C_SOURCE >= 199309L struct timespec ts; ts.tv_sec = ms / 1000; ts.tv_nsec = (ms % 1000) * 1000000; @@ -155,6 +154,7 @@ NAMESPACE_SOUP ::usleep((ms % 1000) * 1000); #endif } +#endif #if SOUP_WINDOWS static bool copy_to_clipboard_utf16(const std::wstring& text) diff --git a/src/vendor/Soup/soup/os.hpp b/src/vendor/Soup/soup/os.hpp index 9673b4618f..9d6b566324 100644 --- a/src/vendor/Soup/soup/os.hpp +++ b/src/vendor/Soup/soup/os.hpp @@ -58,4 +58,11 @@ NAMESPACE_SOUP } #endif }; + +#if SOUP_WINDOWS + inline void os::sleep(unsigned int ms) noexcept + { + ::Sleep(ms); + } +#endif } diff --git a/src/vendor/Soup/soup/sha1.cpp b/src/vendor/Soup/soup/sha1.cpp index a389475138..160c49d3c5 100644 --- a/src/vendor/Soup/soup/sha1.cpp +++ b/src/vendor/Soup/soup/sha1.cpp @@ -14,20 +14,20 @@ #endif #include "StringRefReader.hpp" -#if SHA1_USE_INTRIN -namespace soup_intrin +NAMESPACE_SOUP { - extern void sha1_transform(uint32_t state[5], const uint8_t data[64]) noexcept; -} +#if SHA1_USE_INTRIN + namespace intrin + { + extern void sha1_transform(uint32_t state[5], const uint8_t data[64]) noexcept; + } #endif -NAMESPACE_SOUP -{ // Original source: https://github.com/vog/sha1 // Original licence: Dedicated to the public domain. template - void buffer_to_block(const std::string& buffer, uint32_t block[BLOCK_INTS]) noexcept + void buffer_to_block(const uint8_t* buffer, uint32_t block[BLOCK_INTS]) noexcept { /* Convert the std::string (byte buffer) to a uint32_t array */ for (size_t i = 0; i < BLOCK_INTS; i++) @@ -103,7 +103,7 @@ NAMESPACE_SOUP * Hash a single 512-bit block. This is the core of the algorithm. */ - inline static void transform(uint32_t digest[], uint32_t block[BLOCK_INTS]) noexcept + inline static void transform_impl(uint32_t digest[], uint32_t block[BLOCK_INTS]) noexcept { /* Copy digest[] to working vars */ uint32_t a = digest[0]; @@ -202,139 +202,110 @@ NAMESPACE_SOUP digest[4] += e; } - std::string sha1::hash(const std::string& str) SOUP_EXCAL + std::string sha1::hash(const void* data, size_t len) SOUP_EXCAL { - StringRefReader r(str); - return hash(r); + State sha; + sha.append(data, len); + sha.finalise(); + return sha.getDigest(); } - template - static std::string sha1_hash_impl(Reader& r) SOUP_EXCAL + std::string sha1::hash(const std::string& str) SOUP_EXCAL { - // init - uint32_t digest[] = { - 0x67452301, - 0xefcdab89, - 0x98badcfe, - 0x10325476, - 0xc3d2e1f0, - }; - static_assert(sizeof(digest) == sha1::DIGEST_BYTES); - - std::string buffer{}; - uint64_t transforms = 0; - - size_t in_len = r.getRemainingBytes(); + return hash(str.data(), str.size()); + } - // update - for (; in_len >= sha1::BLOCK_BYTES; in_len -= sha1::BLOCK_BYTES) + std::string sha1::hash(Reader& r) SOUP_EXCAL + { + State sha; + while (r.hasMore()) { - r.str(sha1::BLOCK_BYTES, buffer); + uint8_t b; + r.u8(b); + sha.appendByte(b); + } + sha.finalise(); + return sha.getDigest(); + } + #if SHA1_USE_INTRIN - if constexpr (intrin) - { + [[nodiscard]] static bool sha1_can_use_intrin() noexcept + { #if SOUP_X86 - soup_intrin::sha1_transform(digest, (const uint8_t*)buffer.data()); - #else - uint32_t block[BLOCK_INTS]; - buffer_to_block(buffer, block); - soup_intrin::sha1_transform(digest, (const uint8_t*)block); + const CpuInfo& cpu_info = CpuInfo::get(); + return cpu_info.supportsSSSE3() + && cpu_info.supportsSHA() + ; + #elif SOUP_ARM + return CpuInfo::get().armv8_sha1; #endif - } - else + } #endif - { - uint32_t block[BLOCK_INTS]; - buffer_to_block(buffer, block); - transform(digest, block); - } - ++transforms; - } - - r.str(in_len, buffer); - - // final - - /* Total number of hashed bits */ - uint64_t total_bits = (transforms * sha1::BLOCK_BYTES + buffer.size()) * 8; - - /* Padding */ - buffer += (char)0x80; - size_t orig_size = buffer.size(); - while (buffer.size() < sha1::BLOCK_BYTES) - { - buffer += (char)0x00; - } - uint32_t block[BLOCK_INTS]; - buffer_to_block(buffer, block); - - if (orig_size > sha1::BLOCK_BYTES - 8) - { -#if SHA1_USE_INTRIN - if constexpr (intrin) - { - soup_intrin::sha1_transform(digest, (const uint8_t*)block); - } - else -#endif - { - transform(digest, block); - } - for (size_t i = 0; i < BLOCK_INTS - 2; i++) - { - block[i] = 0; - } - } + sha1::State::State() + { + state[0] = 0x67452301; + state[1] = 0xefcdab89; + state[2] = 0x98badcfe; + state[3] = 0x10325476; + state[4] = 0xc3d2e1f0; + buffer_counter = 0; + n_bits = 0; + } - /* Append total_bits, split this uint64_t into two uint32_t */ + void sha1::State::transform() noexcept + { #if SHA1_USE_INTRIN - if constexpr (intrin) + static bool good_cpu = sha1_can_use_intrin(); + if (good_cpu) { #if SOUP_X86 - block[BLOCK_INTS - 1] = Endianness::invert((uint32_t)total_bits); - block[BLOCK_INTS - 2] = Endianness::invert((uint32_t)(total_bits >> 32)); + intrin::sha1_transform(state, buffer); #else - block[BLOCK_INTS - 1] = (uint32_t)total_bits; - block[BLOCK_INTS - 2] = (uint32_t)(total_bits >> 32); + uint32_t block[BLOCK_INTS]; + buffer_to_block(buffer, block); + intrin::sha1_transform(state, (const uint8_t*)block); #endif - soup_intrin::sha1_transform(digest, (const uint8_t*)block); + return; } - else #endif + uint32_t block[BLOCK_INTS]; + buffer_to_block(buffer, block); + transform_impl(state, block); + } + + void sha1::State::finalise() noexcept + { + uint64_t n_bits = this->n_bits; + + appendByte(0x80); + + while (buffer_counter != 56) { - block[BLOCK_INTS - 1] = (uint32_t)total_bits; - block[BLOCK_INTS - 2] = (uint32_t)(total_bits >> 32); - transform(digest, block); + appendByte(0); } - std::string bin{}; - bin.reserve(sha1::DIGEST_BYTES); - for (size_t i = 0; i < sizeof(digest) / sizeof(digest[0]); i++) + for (int i = 7; i >= 0; i--) { - bin.push_back(((const char*)&digest[i])[3]); - bin.push_back(((const char*)&digest[i])[2]); - bin.push_back(((const char*)&digest[i])[1]); - bin.push_back(((const char*)&digest[i])[0]); + appendByte((n_bits >> 8 * i) & 0xff); } - return bin; } - std::string sha1::hash(Reader& r) SOUP_EXCAL + void sha1::State::getDigest(uint8_t out[DIGEST_BYTES]) const noexcept { -#if SHA1_USE_INTRIN - const CpuInfo& cpu_info = CpuInfo::get(); - #if SOUP_X86 - if (cpu_info.supportsSSSE3() - && cpu_info.supportsSHA() - ) - #elif SOUP_ARM - if (cpu_info.armv8_sha1) - #endif + for (unsigned int i = 0; i != DIGEST_BYTES / 4; i++) { - return sha1_hash_impl(r); + for (int j = 3; j >= 0; j--) + { + *out++ = (state[i] >> j * 8) & 0xff; + } } -#endif - return sha1_hash_impl(r); + } + + std::string sha1::State::getDigest() const SOUP_EXCAL + { + std::string digest(DIGEST_BYTES, '\0'); + getDigest((uint8_t*)digest.data()); + return digest; } } diff --git a/src/vendor/Soup/soup/sha1.hpp b/src/vendor/Soup/soup/sha1.hpp index 2198ed6ce8..7cecaa783c 100644 --- a/src/vendor/Soup/soup/sha1.hpp +++ b/src/vendor/Soup/soup/sha1.hpp @@ -12,7 +12,44 @@ NAMESPACE_SOUP static constexpr auto DIGEST_BYTES = 20u; static constexpr auto BLOCK_BYTES = 64u; + [[nodiscard]] static std::string hash(const void* data, size_t len) SOUP_EXCAL; [[nodiscard]] static std::string hash(const std::string& str) SOUP_EXCAL; [[nodiscard]] static std::string hash(Reader& r) SOUP_EXCAL; + + struct State + { + uint8_t buffer[BLOCK_BYTES]; + uint32_t state[5]; + uint8_t buffer_counter; + uint64_t n_bits; + + State(); + + void append(const void* data, size_t size) noexcept + { + for (size_t i = 0; i != size; ++i) + { + appendByte(reinterpret_cast(data)[i]); + } + } + + void appendByte(uint8_t byte) noexcept + { + buffer[buffer_counter++] = byte; + n_bits += 8; + + if (buffer_counter == BLOCK_BYTES) + { + buffer_counter = 0; + transform(); + } + } + + void transform() noexcept; + void finalise() noexcept; + + void getDigest(uint8_t out[DIGEST_BYTES]) const noexcept; + [[nodiscard]] std::string getDigest() const SOUP_EXCAL; + }; }; } diff --git a/src/vendor/Soup/soup/sha256.cpp b/src/vendor/Soup/soup/sha256.cpp index 836dd4569a..e97e1ebb0b 100644 --- a/src/vendor/Soup/soup/sha256.cpp +++ b/src/vendor/Soup/soup/sha256.cpp @@ -18,15 +18,15 @@ Original source: https://github.com/983/SHA-256 Original licence: Dedicated to the public domain. */ -#if SHA256_USE_INTRIN -namespace soup_intrin +NAMESPACE_SOUP { - extern void sha256_transform(uint32_t state[8], const uint8_t data[64]) noexcept; -} +#if SHA256_USE_INTRIN + namespace intrin + { + extern void sha256_transform(uint32_t state[8], const uint8_t data[64]) noexcept; + } #endif -NAMESPACE_SOUP -{ struct sha256_state { uint32_t state[8]; @@ -105,17 +105,57 @@ NAMESPACE_SOUP 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, }; - static void sha256_block(sha256_state* sha) noexcept + std::string sha256::hash(const void* data, size_t len) SOUP_EXCAL + { + State sha; + sha.append(data, len); + sha.finalise(); + return sha.getDigest(); + } + + std::string sha256::hash(const std::string& str) SOUP_EXCAL + { + return hash(str.data(), str.size()); + } + + std::string sha256::hash(Reader& r) SOUP_EXCAL + { + State sha; + while (r.hasMore()) + { + uint8_t b; + r.u8(b); + sha.appendByte(b); + } + sha.finalise(); + return sha.getDigest(); + } + + sha256::State::State() noexcept + { + state[0] = 0x6a09e667; + state[1] = 0xbb67ae85; + state[2] = 0x3c6ef372; + state[3] = 0xa54ff53a; + state[4] = 0x510e527f; + state[5] = 0x9b05688c; + state[6] = 0x1f83d9ab; + state[7] = 0x5be0cd19; + buffer_counter = 0; + n_bits = 0; + } + + void sha256::State::transform() noexcept { #if SHA256_USE_INTRIN static bool good_cpu = sha256_can_use_intrin(); if (good_cpu) { - return soup_intrin::sha256_transform(sha->state, sha->buffer); + return intrin::sha256_transform(this->state, this->buffer); } #endif - uint32_t* state = sha->state; + uint32_t* state = this->state; uint32_t a = state[0]; uint32_t b = state[1]; @@ -130,7 +170,7 @@ NAMESPACE_SOUP int i, j; for (i = 0; i < 64; i += 16) { - update_w(w, i, sha->buffer); + update_w(w, i, buffer); for (j = 0; j < 16; j += 4) { uint32_t temp; @@ -159,102 +199,38 @@ NAMESPACE_SOUP state[7] += h; } - void sha256_init(sha256_state* sha) noexcept - { - sha->state[0] = 0x6a09e667; - sha->state[1] = 0xbb67ae85; - sha->state[2] = 0x3c6ef372; - sha->state[3] = 0xa54ff53a; - sha->state[4] = 0x510e527f; - sha->state[5] = 0x9b05688c; - sha->state[6] = 0x1f83d9ab; - sha->state[7] = 0x5be0cd19; - sha->n_bits = 0; - sha->buffer_counter = 0; - } - - void sha256_append_byte(sha256_state* sha, uint8_t byte) noexcept + void sha256::State::finalise() noexcept { - sha->buffer[sha->buffer_counter++] = byte; - sha->n_bits += 8; + uint64_t n_bits = this->n_bits; - if (sha->buffer_counter == 64) { - sha->buffer_counter = 0; - sha256_block(sha); - } - } + appendByte(0x80); - void sha256_append(sha256_state* sha, const void* src, size_t n_bytes) noexcept - { - const uint8_t* bytes = (const uint8_t*)src; - size_t i; - - for (i = 0; i < n_bytes; i++) { - sha256_append_byte(sha, bytes[i]); - } - } - - void sha256_finalize(sha256_state* sha) noexcept - { - int i; - uint64_t n_bits = sha->n_bits; - - sha256_append_byte(sha, 0x80); - - while (sha->buffer_counter != 56) { - sha256_append_byte(sha, 0); + while (buffer_counter != 56) + { + appendByte(0); } - for (i = 7; i >= 0; i--) { - uint8_t byte = (n_bits >> 8 * i) & 0xff; - sha256_append_byte(sha, byte); + for (int i = 7; i >= 0; i--) + { + appendByte((n_bits >> 8 * i) & 0xff); } } - void sha256_finalize_bytes(sha256_state* sha, uint8_t dst_bytes[32]) noexcept + void sha256::State::getDigest(uint8_t out[DIGEST_BYTES]) const noexcept { - int i, j; - sha256_finalize(sha); - - for (i = 0; i < 8; i++) { - for (j = 3; j >= 0; j--) { - *dst_bytes++ = (sha->state[i] >> j * 8) & 0xff; + for (unsigned int i = 0; i != DIGEST_BYTES / 4; i++) + { + for (int j = 3; j >= 0; j--) + { + *out++ = (state[i] >> j * 8) & 0xff; } } } - void sha256_bytes(const void* src, size_t n_bytes, uint8_t dst_bytes[32]) noexcept - { - sha256_state sha; - sha256_init(&sha); - sha256_append(&sha, src, n_bytes); - sha256_finalize_bytes(&sha, dst_bytes); - } - - std::string sha256::hash(const void* data, size_t len) SOUP_EXCAL - { - std::string digest(32, '\0'); - sha256_bytes(data, len, (uint8_t*)digest.data()); - return digest; - } - - std::string sha256::hash(const std::string& str) SOUP_EXCAL - { - return hash(str.data(), str.size()); - } - - std::string sha256::hash(Reader& r) SOUP_EXCAL + std::string sha256::State::getDigest() const SOUP_EXCAL { - std::string digest(32, '\0'); - sha256_state sha; - sha256_init(&sha); - while (r.hasMore()) - { - uint8_t b; - r.u8(b); - sha256_append_byte(&sha, b); - } - sha256_finalize_bytes(&sha, (uint8_t*)digest.data()); + std::string digest(DIGEST_BYTES, '\0'); + getDigest((uint8_t*)digest.data()); return digest; } } diff --git a/src/vendor/Soup/soup/sha256.hpp b/src/vendor/Soup/soup/sha256.hpp index 0f95e1adde..3049331a05 100644 --- a/src/vendor/Soup/soup/sha256.hpp +++ b/src/vendor/Soup/soup/sha256.hpp @@ -15,5 +15,41 @@ NAMESPACE_SOUP [[nodiscard]] static std::string hash(const void* data, size_t len) SOUP_EXCAL; [[nodiscard]] static std::string hash(const std::string& str) SOUP_EXCAL; [[nodiscard]] static std::string hash(Reader& r) SOUP_EXCAL; + + struct State + { + uint8_t buffer[BLOCK_BYTES]; + uint32_t state[8]; + uint8_t buffer_counter; + uint64_t n_bits; + + State() noexcept; + + void append(const void* data, size_t size) noexcept + { + for (size_t i = 0; i != size; ++i) + { + appendByte(reinterpret_cast(data)[i]); + } + } + + void appendByte(uint8_t byte) noexcept + { + buffer[buffer_counter++] = byte; + n_bits += 8; + + if (buffer_counter == BLOCK_BYTES) + { + buffer_counter = 0; + transform(); + } + } + + void transform() noexcept; + void finalise() noexcept; + + void getDigest(uint8_t out[DIGEST_BYTES]) const noexcept; + [[nodiscard]] std::string getDigest() const SOUP_EXCAL; + }; }; } diff --git a/src/vendor/Soup/soup/sha384.cpp b/src/vendor/Soup/soup/sha384.cpp index 4dd55e7731..828d8a19d7 100644 --- a/src/vendor/Soup/soup/sha384.cpp +++ b/src/vendor/Soup/soup/sha384.cpp @@ -1,10 +1,5 @@ #include "sha384.hpp" -#include // memcpy - -#include "sha512.hpp" -#include "StringWriter.hpp" - /* Original source: https://github.com/pr0f3ss/SHA Original licence follows. @@ -36,58 +31,47 @@ NAMESPACE_SOUP { std::string sha384::hash(const void* data, size_t len) SOUP_EXCAL { - uint64_t h[HASH_LEN]; // buffer holding the message digest (512-bit = 8 64-bit words) - memcpy(h, hPrime, WORKING_VAR_LEN * sizeof(uint64_t)); + State sha; + sha.append(data, len); + sha.finalise(); + return sha.getDigest(); + } + + std::string sha384::hash(const std::string& str) SOUP_EXCAL + { + return hash(str.data(), str.size()); + } - const size_t l = len * CHAR_LEN_BITS; // length of input in bits - const size_t k = (896 - 1 - l) % MESSAGE_BLOCK_SIZE; // length of zero bit padding (l + 1 + k = 896 mod 1024) - const size_t nBuffer = (l + 1 + k + 128) / MESSAGE_BLOCK_SIZE; + sha384::State::State() noexcept + { + // implicitly calls sha512::State::State() + state[0] = 0xcbbb9d5dc1059ed8ULL; + state[1] = 0x629a292a367cd507ULL; + state[2] = 0x9159015a3070dd17ULL; + state[3] = 0x152fecd8f70e5939ULL; + state[4] = 0x67332667ffc00b31ULL; + state[5] = 0x8eb44a8768581511ULL; + state[6] = 0xdb0c2e0d64f98fa7ULL; + state[7] = 0x47b5481dbefa4fa4ULL; + //buffer_counter = 0; + //n_bits = 0; + } - for (size_t i = 0; i != nBuffer; ++i) + void sha384::State::getDigest(uint8_t out[DIGEST_BYTES]) const noexcept + { + for (unsigned int i = 0; i != DIGEST_BYTES / 8; i++) { - uint64_t buffer[SEQUENCE_LEN]; - for (size_t j = 0; j != SEQUENCE_LEN; ++j) - { - uint64_t in = 0x0ULL; - for (size_t k = 0; k != WORD_LEN; ++k) - { - size_t index = i * 128 + j * 8 + k; - if (index < len) - { - in = in << 8 | (uint64_t)reinterpret_cast(data)[index]; - } - else if (index == len) - { - in = in << 8 | 0x80ULL; - } - else - { - in = in << 8 | 0x0ULL; - } - } - buffer[j] = in; - } - if (i == nBuffer - 1) + for (int j = 7; j >= 0; j--) { - buffer[SEQUENCE_LEN - 1] = l; - buffer[SEQUENCE_LEN - 2] = 0x00ULL; + *out++ = (state[i] >> j * 8) & 0xff; } - sha512::processBlock(buffer, h); } - - StringWriter sw; - sw.data.reserve(6 * 8); - sw.u64_be(h[0]); - sw.u64_be(h[1]); - sw.u64_be(h[2]); - sw.u64_be(h[3]); - sw.u64_be(h[4]); - sw.u64_be(h[5]); - return std::move(sw.data); } - std::string sha384::hash(const std::string& str) SOUP_EXCAL + std::string sha384::State::getDigest() const SOUP_EXCAL { - return hash(str.data(), str.size()); + std::string digest(DIGEST_BYTES, '\0'); + getDigest((uint8_t*)digest.data()); + return digest; } } diff --git a/src/vendor/Soup/soup/sha384.hpp b/src/vendor/Soup/soup/sha384.hpp index 92748c855f..1b7edd4b31 100644 --- a/src/vendor/Soup/soup/sha384.hpp +++ b/src/vendor/Soup/soup/sha384.hpp @@ -1,9 +1,7 @@ #pragma once #include "CryptoHashAlgo.hpp" - -#include -#include +#include "sha512.hpp" NAMESPACE_SOUP { @@ -16,14 +14,12 @@ NAMESPACE_SOUP [[nodiscard]] static std::string hash(const void* data, size_t len) SOUP_EXCAL; [[nodiscard]] static std::string hash(const std::string& str) SOUP_EXCAL; + struct State : public sha512::State + { + State() noexcept; - static constexpr unsigned int SEQUENCE_LEN = (1024 / 64); - static constexpr size_t HASH_LEN = 8; - static constexpr size_t WORKING_VAR_LEN = 8; - static constexpr size_t MESSAGE_BLOCK_SIZE = 1024; - static constexpr size_t CHAR_LEN_BITS = 8; - static constexpr size_t WORD_LEN = 8; - - static constexpr const uint64_t hPrime[8] = { 0xcbbb9d5dc1059ed8ULL, 0x629a292a367cd507ULL, 0x9159015a3070dd17ULL, 0x152fecd8f70e5939ULL, 0x67332667ffc00b31ULL, 0x8eb44a8768581511ULL, 0xdb0c2e0d64f98fa7ULL, 0x47b5481dbefa4fa4ULL }; + void getDigest(uint8_t out[DIGEST_BYTES]) const noexcept; + [[nodiscard]] std::string getDigest() const SOUP_EXCAL; + }; }; } diff --git a/src/vendor/Soup/soup/sha512.cpp b/src/vendor/Soup/soup/sha512.cpp index 95f180b774..23ab4c8fac 100644 --- a/src/vendor/Soup/soup/sha512.cpp +++ b/src/vendor/Soup/soup/sha512.cpp @@ -2,7 +2,7 @@ #include // memcpy -#include "StringWriter.hpp" +#include "Endian.hpp" /* Original source: https://github.com/pr0f3ss/SHA @@ -35,56 +35,10 @@ NAMESPACE_SOUP { std::string sha512::hash(const void* data, size_t len) SOUP_EXCAL { - uint64_t h[HASH_LEN]; // buffer holding the message digest (512-bit = 8 64-bit words) - memcpy(h, hPrime, WORKING_VAR_LEN * sizeof(uint64_t)); - - const size_t l = len * CHAR_LEN_BITS; // length of input in bits - const size_t k = (896 - 1 - l) % MESSAGE_BLOCK_SIZE; // length of zero bit padding (l + 1 + k = 896 mod 1024) - const size_t nBuffer = (l + 1 + k + 128) / MESSAGE_BLOCK_SIZE; - - for (size_t i = 0; i != nBuffer; ++i) - { - uint64_t buffer[SEQUENCE_LEN]; - for (size_t j = 0; j != SEQUENCE_LEN; ++j) - { - uint64_t in = 0x0ULL; - for (size_t k = 0; k != WORD_LEN; ++k) - { - size_t index = i * 128 + j * 8 + k; - if (index < len) - { - in = in << 8 | (uint64_t)reinterpret_cast(data)[index]; - } - else if (index == len) - { - in = in << 8 | 0x80ULL; - } - else - { - in = in << 8 | 0x0ULL; - } - } - buffer[j] = in; - } - if (i == nBuffer - 1) - { - buffer[SEQUENCE_LEN - 1] = l; - buffer[SEQUENCE_LEN - 2] = 0x00ULL; - } - processBlock(buffer, h); - } - - StringWriter sw; - sw.data.reserve(8 * 8); - sw.u64_be(h[0]); - sw.u64_be(h[1]); - sw.u64_be(h[2]); - sw.u64_be(h[3]); - sw.u64_be(h[4]); - sw.u64_be(h[5]); - sw.u64_be(h[6]); - sw.u64_be(h[7]); - return std::move(sw.data); + State sha; + sha.append(data, len); + sha.finalise(); + return sha.getDigest(); } std::string sha512::hash(const std::string& str) SOUP_EXCAL @@ -110,6 +64,7 @@ NAMESPACE_SOUP 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL }; static constexpr size_t MESSAGE_SCHEDULE_LEN = 80; + static constexpr size_t WORKING_VAR_LEN = 8; #define Ch(x,y,z) ((x&y)^(~x&z)) #define Maj(x,y,z) ((x&y)^(x&z)^(y&z)) @@ -119,20 +74,36 @@ NAMESPACE_SOUP #define sig0(x) (RotR(x, 1)^RotR(x,8)^(x>>7)) #define sig1(x) (RotR(x, 19)^RotR(x,61)^(x>>6)) - void sha512::processBlock(uint64_t block[16], uint64_t h[8]) + sha512::State::State() noexcept { - uint64_t s[WORKING_VAR_LEN]; - uint64_t w[MESSAGE_SCHEDULE_LEN]; - - // copy over to message schedule - memcpy(w, block, SEQUENCE_LEN * sizeof(uint64_t)); + state[0] = 0x6a09e667f3bcc908ULL; + state[1] = 0xbb67ae8584caa73bULL; + state[2] = 0x3c6ef372fe94f82bULL; + state[3] = 0xa54ff53a5f1d36f1ULL; + state[4] = 0x510e527fade682d1ULL; + state[5] = 0x9b05688c2b3e6c1fULL; + state[6] = 0x1f83d9abfb41bd6bULL; + state[7] = 0x5be0cd19137e2179ULL; + buffer_counter = 0; + n_bits = 0; + } - // Prepare the message schedule + void sha512::State::transform() noexcept + { + // Initialise message schedule + uint64_t w[MESSAGE_SCHEDULE_LEN]; + for (size_t i = 0; i != BLOCK_BYTES / 8; ++i) + { + w[i] = Endianness::invert(reinterpret_cast(buffer)[i]); + } for (size_t j = 16; j != MESSAGE_SCHEDULE_LEN; ++j) { w[j] = w[j - 16] + sig0(w[j - 15]) + w[j - 7] + sig1(w[j - 2]); } - // Initialize the working variables + + // Initialise the working variables + uint64_t* const h = this->state; + uint64_t s[WORKING_VAR_LEN]; memcpy(s, h, WORKING_VAR_LEN * sizeof(uint64_t)); // Compression @@ -157,4 +128,39 @@ NAMESPACE_SOUP h[j] += s[j]; } } + + void sha512::State::finalise() noexcept + { + uint64_t n_bits = this->n_bits; + + appendByte(0x80); + + while (buffer_counter != 120) + { + appendByte(0); + } + + for (int i = 7; i >= 0; i--) + { + appendByte((n_bits >> 8 * i) & 0xff); + } + } + + void sha512::State::getDigest(uint8_t out[DIGEST_BYTES]) const noexcept + { + for (unsigned int i = 0; i != DIGEST_BYTES / 8; i++) + { + for (int j = 7; j >= 0; j--) + { + *out++ = (state[i] >> j * 8) & 0xff; + } + } + } + + std::string sha512::State::getDigest() const SOUP_EXCAL + { + std::string digest(DIGEST_BYTES, '\0'); + getDigest((uint8_t*)digest.data()); + return digest; + } } diff --git a/src/vendor/Soup/soup/sha512.hpp b/src/vendor/Soup/soup/sha512.hpp index 99537dc4ac..ca916b7ea2 100644 --- a/src/vendor/Soup/soup/sha512.hpp +++ b/src/vendor/Soup/soup/sha512.hpp @@ -16,16 +16,40 @@ NAMESPACE_SOUP [[nodiscard]] static std::string hash(const void* data, size_t len) SOUP_EXCAL; [[nodiscard]] static std::string hash(const std::string& str) SOUP_EXCAL; - static void processBlock(uint64_t block[16], uint64_t h[8]); - - - static constexpr unsigned int SEQUENCE_LEN = (1024 / 64); - static constexpr size_t HASH_LEN = 8; - static constexpr size_t WORKING_VAR_LEN = 8; - static constexpr size_t MESSAGE_BLOCK_SIZE = 1024; - static constexpr size_t CHAR_LEN_BITS = 8; - static constexpr size_t WORD_LEN = 8; - - static constexpr const uint64_t hPrime[8] = { 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL }; + struct State + { + uint8_t buffer[BLOCK_BYTES]; + uint64_t state[8]; + uint8_t buffer_counter; + uint64_t n_bits; + + State() noexcept; + + void append(const void* data, size_t size) noexcept + { + for (size_t i = 0; i != size; ++i) + { + appendByte(reinterpret_cast(data)[i]); + } + } + + void appendByte(uint8_t byte) noexcept + { + buffer[buffer_counter++] = byte; + n_bits += 8; + + if (buffer_counter == BLOCK_BYTES) + { + buffer_counter = 0; + transform(); + } + } + + void transform() noexcept; + void finalise() noexcept; + + void getDigest(uint8_t out[DIGEST_BYTES]) const noexcept; + [[nodiscard]] std::string getDigest() const SOUP_EXCAL; + }; }; } diff --git a/src/vendor/Soup/soup/string.cpp b/src/vendor/Soup/soup/string.cpp index c884ab579e..bf171692ce 100644 --- a/src/vendor/Soup/soup/string.cpp +++ b/src/vendor/Soup/soup/string.cpp @@ -144,6 +144,7 @@ NAMESPACE_SOUP std::string ret; if (std::filesystem::exists(file)) { +#if SOUP_WINDOWS // kinda messes with hwHid on Linux, also unsure if memory mapping is faster than direct file access on Linux. size_t len; if (auto addr = soup::filesystem::createFileMapping(file, len)) { @@ -151,6 +152,7 @@ NAMESPACE_SOUP soup::filesystem::destroyFileMapping(addr, len); } else // File might be open in another process, causing memory mapping to fail. +#endif { std::ifstream t(file, std::ios::binary); From b4eab5bdfe9a1256ff3f894190462feafe29e2b4 Mon Sep 17 00:00:00 2001 From: Sainan Date: Mon, 12 Aug 2024 17:43:43 +0200 Subject: [PATCH 3/3] Avoid copying string for socket.send --- src/lsocketlib.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lsocketlib.cpp b/src/lsocketlib.cpp index a60a7a71f8..0beda7989e 100644 --- a/src/lsocketlib.cpp +++ b/src/lsocketlib.cpp @@ -90,7 +90,9 @@ static int l_connect (lua_State *L) { } static int l_send (lua_State *L) { - checksocket(L, 1)->sock->send(pluto_checkstring(L, 2)); + size_t len; + const char *str = luaL_checklstring(L, 2, &len); + checksocket(L, 1)->sock->send(str, len); return 0; }