diff --git a/AUTHORS b/AUTHORS index 2439d7a452..4c3bd93f7c 100644 --- a/AUTHORS +++ b/AUTHORS @@ -10,3 +10,4 @@ Sanjay Ghemawat # Partial list of contributors: Kevin Regan Johan Bilien +Fangming Fang diff --git a/Makefile b/Makefile index f7cc7d736c..0c8674a94a 100644 --- a/Makefile +++ b/Makefile @@ -422,3 +422,9 @@ $(STATIC_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc $(SHARED_OUTDIR)/port/port_posix_sse.o: port/port_posix_sse.cc $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_SSEFLAGS) -c $< -o $@ + +$(STATIC_OUTDIR)/port/port_posix_linux_arm64.o: port/port_posix_linux_arm64.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_ARMV8_CRC32FLAGS) -c $< -o $@ + +$(SHARED_OUTDIR)/port/port_posix_linux_arm64.o: port/port_posix_linux_arm64.cc + $(CXX) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(PLATFORM_ARMV8_CRC32FLAGS) -c $< -o $@ diff --git a/build_detect_platform b/build_detect_platform index d2a20ce5b6..c1859396ea 100755 --- a/build_detect_platform +++ b/build_detect_platform @@ -53,6 +53,11 @@ if test -z "$TARGET_OS"; then TARGET_OS=`uname -s` fi +# Detect ARCH +if test -z "$TARGET_ARCH"; then + TARGET_ARCH=`$CXX -dumpmachine | cut -d- -f1` +fi + COMMON_FLAGS= CROSS_COMPILE= PLATFORM_CCFLAGS= @@ -64,6 +69,7 @@ PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" PLATFORM_SHARED_VERSIONED=true PLATFORM_SSEFLAGS= +PORT_CRC_FILE_ARMV8= MEMCMP_FLAG= if [ "$CXX" = "g++" ]; then @@ -164,6 +170,11 @@ case "$TARGET_OS" in exit 1 esac +if [ $TARGET_ARCH = "aarch64" -a $PLATFORM = "OS_LINUX" ];then + PORT_SSE_FILE= + PORT_CRC_FILE_ARMV8=port/port_posix_linux_arm64.cc +fi + # We want to make a list of all cc files within util, db, table, and helpers # except for the test and benchmark files. By default, find will output a list # of all files matching either rule, so we need to append -print to make the @@ -180,7 +191,7 @@ set +f # re-enable globbing # The sources consist of the portable files, plus the platform-specific port # file. -echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE" >> $OUTPUT +echo "SOURCES=$PORTABLE_FILES $PORT_FILE $PORT_SSE_FILE $PORT_CRC_FILE_ARMV8" >> $OUTPUT echo "MEMENV_SOURCES=helpers/memenv/memenv.cc" >> $OUTPUT if [ "$CROSS_COMPILE" = "true" ]; then @@ -232,6 +243,18 @@ EOF fi rm -f $CXXOUTPUT 2>/dev/null + + # Test if gcc armv8-a+crc+crypto is supported + $CXX $CXXFLAGS -x c++ - -o $CXXOUTPUT -march=armv8-a+crc+crypto 2>/dev/null < + #include + int main() {__crc32cd(0, 0); vmull_p64(0, 0);} +EOF + if [ "$?" = 0 ]; then + PLATFORM_ARMV8_CRC32FLAGS="-march=armv8-a+crc+crypto" + fi + + rm -f $CXXOUTPUT 2>/dev/null fi # Use the SSE 4.2 CRC32C intrinsics iff runtime checks indicate compiler supports them. @@ -239,6 +262,11 @@ if [ -n "$PLATFORM_SSEFLAGS" ]; then PLATFORM_SSEFLAGS="$PLATFORM_SSEFLAGS -DLEVELDB_PLATFORM_POSIX_SSE" fi +# Use the ARMv8 CRC32C intrinsics if runtime checks indicate compiler supports them. +if [ -n "$PLATFORM_ARMV8_CRC32FLAGS" ]; then + PLATFORM_ARMV8_CRC32FLAGS="$PLATFORM_ARMV8_CRC32FLAGS -DLEVELDB_PLATFORM_POSIX_ARMV8_CRC_CRYPTO" +fi + PLATFORM_CCFLAGS="$PLATFORM_CCFLAGS $COMMON_FLAGS" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS $COMMON_FLAGS" @@ -254,3 +282,4 @@ echo "PLATFORM_SHARED_CFLAGS=$PLATFORM_SHARED_CFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_EXT=$PLATFORM_SHARED_EXT" >> $OUTPUT echo "PLATFORM_SHARED_LDFLAGS=$PLATFORM_SHARED_LDFLAGS" >> $OUTPUT echo "PLATFORM_SHARED_VERSIONED=$PLATFORM_SHARED_VERSIONED" >> $OUTPUT +echo "PLATFORM_ARMV8_CRC32FLAGS=$PLATFORM_ARMV8_CRC32FLAGS" >> $OUTPUT diff --git a/port/port_posix_linux_arm64.cc b/port/port_posix_linux_arm64.cc new file mode 100644 index 0000000000..b8c023bb9d --- /dev/null +++ b/port/port_posix_linux_arm64.cc @@ -0,0 +1,139 @@ +// Copyright 2017 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +// +// A portable implementation of crc32c, optimized to handle +// up to eight bytes at a time. +// +// In a separate source file to allow this accelerated CRC32C function to be +// compiled with the appropriate compiler flags to enable aarch64 crc32 +// instructions. + +#include +#include +#include "port/port.h" + + +#if defined(LEVELDB_PLATFORM_POSIX_ARMV8_CRC_CRYPTO) + +#include +#include +#include + +// see kernel file 'arch/arm64/include/uapi/asm/hwcap.h' +#define HWCAP_CRC32 (1 << 7) +#define HWCAP_PMULL (1 << 4) + +#define KBYTES 1032 +#define SEGMENTBYTES 256 + +// compute 8bytes for each segment parallelly +#define CRC32C32BYTES(P, IND) do {\ + crc1 = __crc32cd(crc1, *((const uint64_t *)(P) + (SEGMENTBYTES/8)*1 + (IND)));\ + crc2 = __crc32cd(crc2, *((const uint64_t *)(P) + (SEGMENTBYTES/8)*2 + (IND)));\ + crc3 = __crc32cd(crc3, *((const uint64_t *)(P) + (SEGMENTBYTES/8)*3 + (IND)));\ + crc0 = __crc32cd(crc0, *((const uint64_t *)(P) + (SEGMENTBYTES/8)*0 + (IND)));\ + } while(0); + +// compute 8*8 bytes for each segment parallelly +#define CRC32C256BYTES(P, IND) do {\ + CRC32C32BYTES((P), (IND)*8+0) \ + CRC32C32BYTES((P), (IND)*8+1) \ + CRC32C32BYTES((P), (IND)*8+2) \ + CRC32C32BYTES((P), (IND)*8+3) \ + CRC32C32BYTES((P), (IND)*8+4) \ + CRC32C32BYTES((P), (IND)*8+5) \ + CRC32C32BYTES((P), (IND)*8+6) \ + CRC32C32BYTES((P), (IND)*8+7) \ + } while(0); + +// compute 4*8*8 bytes for each segment parallelly +#define CRC32C1024BYTES(P) do {\ + CRC32C256BYTES((P), 0) \ + CRC32C256BYTES((P), 1) \ + CRC32C256BYTES((P), 2) \ + CRC32C256BYTES((P), 3) \ + (P) += 4*SEGMENTBYTES; \ + } while(0) + +#endif // defined(LEVELDB_PLATFORM_POSIX_ARMV8_CRC_CRYPTO) + +namespace leveldb { +namespace port { + +static inline bool CanAccelerateCRC32C() { + unsigned long hwcap = getauxval(AT_HWCAP); + if ((hwcap & HWCAP_CRC32) && (hwcap & HWCAP_PMULL)) { + return true; + } + return false; +} + +uint32_t AcceleratedCRC32C(uint32_t crc, const char* buf, size_t size) { +#if !defined(LEVELDB_PLATFORM_POSIX_ARMV8_CRC_CRYPTO) + return 0; +#else + static bool can = CanAccelerateCRC32C(); + if(!can) { + return 0; + } + + int64_t length = size; + uint32_t crc0, crc1, crc2, crc3; + uint64_t t0, t1, t2; + + // k0=CRC(x^(3*SEGMENTBYTES*8)), k1=CRC(x^(2*SEGMENTBYTES*8)), k2=CRC(x^(SEGMENTBYTES*8)) + const poly64_t k0 = 0x8d96551c, k1 = 0xbd6f81f8, k2 = 0xdcb17aa4; + + crc = crc ^ 0xffffffffu; + const uint8_t *p = reinterpret_cast(buf); + + while ( length >= KBYTES) { + crc0 = crc; + crc1 = 0; + crc2 = 0; + crc3 = 0; + + // compute 1024bytes parallelly + CRC32C1024BYTES(p); + + // merge crc0 crc1 crc2 crc3 + t2 = (uint64_t)vmull_p64(crc2, k2); + t1 = (uint64_t)vmull_p64(crc1, k1); + t0 = (uint64_t)vmull_p64(crc0, k0); + crc = __crc32cd(crc3, *(uint64_t *)p); + p += sizeof(uint64_t); + crc ^= __crc32cd(0, t2); + crc ^= __crc32cd(0, t1); + crc ^= __crc32cd(0, t0); + + length -= KBYTES; + } + + while(length >= sizeof(uint64_t)) { + crc = __crc32cd(crc, *(uint64_t *)p); + p += sizeof(uint64_t); + length -= sizeof(uint64_t); + } + + if(length & sizeof(uint32_t)) { + crc = __crc32cw(crc, *(uint32_t *)p); + p += sizeof(uint32_t); + } + + if(length & sizeof(uint16_t)) { + crc = __crc32ch(crc, *(uint16_t *)p); + p += sizeof(uint16_t); + } + + if(length & sizeof(uint8_t)) { + crc = __crc32cb(crc, *p); + } + + return crc ^ 0xffffffffu; + +#endif // defined(LEVELDB_PLATFORM_POSIX_ARMV8_CRC_CRYPTO) +} + +} // namespace port +} // namespace leveldb