From b69ea6e42e87326af30a55bf344642931755ec63 Mon Sep 17 00:00:00 2001 From: Fabian Druschke Date: Sat, 7 Sep 2024 23:15:58 +0200 Subject: [PATCH 1/2] Added and optimized RC4 PRNG to generate 4096-byte random blocks using AVX2 and SSE 4.2 for improved performance. --- configure.ac | 96 +++++++++++++++------------- src/Makefile.am | 2 +- src/gui.c | 44 ++++++++++++- src/options.c | 14 ++++- src/prng.c | 153 +++++++++++++++++++++++++++++++++++++++++++++ src/prng.h | 7 +++ src/rc4/rc4_prng.c | 151 ++++++++++++++++++++++++++++++++++++++++++++ src/rc4/rc4_prng.h | 50 +++++++++++++++ 8 files changed, 471 insertions(+), 46 deletions(-) create mode 100644 src/rc4/rc4_prng.c create mode 100644 src/rc4/rc4_prng.h diff --git a/configure.ac b/configure.ac index 2720d3e9..b4c5ec44 100644 --- a/configure.ac +++ b/configure.ac @@ -16,55 +16,55 @@ PKG_PROG_PKG_CONFIG # Checks for libraries. PKG_CHECK_MODULES( - [PANEL], - [panel], - [ - CFLAGS="${CFLAGS} ${PANEL_CFLAGS}" - LIBS="${LIBS} ${PANEL_LIBS}" - ], - [AC_CHECK_LIB([panel], [main], [ - LIBS="-lpanel $LIBS" - AC_CHECK_HEADERS(panel.h,, [ - AC_CHECK_HEADERS(ncurses/panel.h, [ - AC_DEFINE([PANEL_IN_SUBDIR], [ncurses/], [Look for ncurses headers in subdir]) - ], [AC_MSG_ERROR([ncurses panel headers not found])]) - ]) - ], [AC_MSG_ERROR([ncurses panel library not found])])] + [PANEL], + [panel], + [ + CFLAGS="${CFLAGS} ${PANEL_CFLAGS}" + LIBS="${LIBS} ${PANEL_LIBS}" + ], + [AC_CHECK_LIB([panel], [main], [ + LIBS="-lpanel $LIBS" + AC_CHECK_HEADERS(panel.h,, [ + AC_CHECK_HEADERS(ncurses/panel.h, [ + AC_DEFINE([PANEL_IN_SUBDIR], [ncurses/], [Look for ncurses headers in subdir]) + ], [AC_MSG_ERROR([ncurses panel headers not found])]) + ]) + ], [AC_MSG_ERROR([ncurses panel library not found])])] ) PKG_CHECK_MODULES( - [NCURSES], - [ncurses], - [ - CFLAGS="${CFLAGS} ${NCURSES_CFLAGS}" - LIBS="${LIBS} ${NCURSES_LIBS}" - ], - [AC_CHECK_LIB([ncurses], [delscreen], [ - LIBS="-lncurses $LIBS" - AC_CHECK_HEADERS(ncurses.h,, [ - AC_CHECK_HEADERS(ncurses/ncurses.h, [ - AC_DEFINE([NCURSES_IN_SUBDIR], [ncurses/], [Look for ncurses headers in subdir]) - ], [AC_MSG_ERROR([ncurses headers not found])]) - ]) - ], [AC_MSG_ERROR([ncurses development library not found])] - )] + [NCURSES], + [ncurses], + [ + CFLAGS="${CFLAGS} ${NCURSES_CFLAGS}" + LIBS="${LIBS} ${NCURSES_LIBS}" + ], + [AC_CHECK_LIB([ncurses], [delscreen], [ + LIBS="-lncurses $LIBS" + AC_CHECK_HEADERS(ncurses.h,, [ + AC_CHECK_HEADERS(ncurses/ncurses.h, [ + AC_DEFINE([NCURSES_IN_SUBDIR], [ncurses/], [Look for ncurses headers in subdir]) + ], [AC_MSG_ERROR([ncurses headers not found])]) + ]) + ], [AC_MSG_ERROR([ncurses development library not found])] + )] ) PKG_CHECK_MODULES( - [LIBCONFIG], - [libconfig], - [ - CFLAGS="${CFLAGS} ${LIBCONFIG_CFLAGS}" - LIBS="${LIBS} ${LIBCONFIG_LIBS}" - ], - [AC_CHECK_LIB([libconfig], [main], [ - LIBS="-llibconfig $LIBS" - AC_CHECK_HEADERS(libconfig.h,, [ - AC_CHECK_HEADERS(libconfig.h, [ - AC_DEFINE([LIBCONFIG_IN_SUBDIR], [libconfig/], [Look for libconfig headers in subdir]) - ], [AC_MSG_ERROR([libconfig headers not found])]) - ]) - ], [AC_MSG_ERROR([libconfig library not found])])] + [LIBCONFIG], + [libconfig], + [ + CFLAGS="${CFLAGS} ${LIBCONFIG_CFLAGS}" + LIBS="${LIBS} ${LIBCONFIG_LIBS}" + ], + [AC_CHECK_LIB([libconfig], [main], [ + LIBS="-llibconfig $LIBS" + AC_CHECK_HEADERS(libconfig.h,, [ + AC_CHECK_HEADERS(libconfig.h, [ + AC_DEFINE([LIBCONFIG_IN_SUBDIR], [libconfig/], [Look for libconfig headers in subdir]) + ], [AC_MSG_ERROR([libconfig headers not found])]) + ]) + ], [AC_MSG_ERROR([libconfig library not found])])] ) AC_CHECK_LIB([intl], [libintl_dgettext]) # needed to statically link libparted, but not given in its pkgconfig file @@ -83,4 +83,14 @@ AC_CHECK_MEMBERS([struct stat.st_blksize]) AC_FUNC_MALLOC AC_CHECK_FUNCS([fdatasync memset regcomp strdup strerror]) +# Check if AVX2 is supported +AC_MSG_CHECKING([whether the compiler supports AVX2]) +AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[__m256i test = _mm256_set1_epi32(0);]])], + [CFLAGS="${CFLAGS} -mavx2" + AC_DEFINE([HAVE_AVX2], [1], [Define if AVX2 is supported]) + AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + AC_OUTPUT + diff --git a/src/Makefile.am b/src/Makefile.am index ac652c91..cead7155 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -6,5 +6,5 @@ AM_LDFLAGS = # this lists the binaries to produce, the (non-PHONY, binary) targets in # the previous manual Makefile bin_PROGRAMS = nwipe -nwipe_SOURCES = context.h logging.h options.h prng.h version.h temperature.h nwipe.c gui.c method.h pass.c device.c gui.h isaac_rand/isaac_standard.h isaac_rand/isaac_rand.h isaac_rand/isaac_rand.c isaac_rand/isaac64.h isaac_rand/isaac64.c mt19937ar-cok/mt19937ar-cok.c nwipe.h mt19937ar-cok/mt19937ar-cok.h alfg/add_lagg_fibonacci_prng.h alfg/add_lagg_fibonacci_prng.c xor/xoroshiro256_prng.h xor/xoroshiro256_prng.c pass.h device.h logging.c method.c options.c prng.c version.c temperature.c PDFGen/pdfgen.h PDFGen/pdfgen.c create_pdf.c create_pdf.h embedded_images/shred_db.jpg.c embedded_images/shred_db.jpg.h embedded_images/tick_erased.jpg.c embedded_images/tick_erased.jpg.h embedded_images/redcross.c embedded_images/redcross.h hpa_dco.h hpa_dco.c miscellaneous.h miscellaneous.c embedded_images/nwipe_exclamation.jpg.h embedded_images/nwipe_exclamation.jpg.c conf.h conf.c customers.h customers.c hddtemp_scsi/hddtemp.h hddtemp_scsi/scsi.h hddtemp_scsi/scsicmds.h hddtemp_scsi/get_scsi_temp.c hddtemp_scsi/scsi.c hddtemp_scsi/scsicmds.c +nwipe_SOURCES = context.h logging.h options.h prng.h version.h temperature.h nwipe.c gui.c method.h pass.c device.c gui.h rc4/rc4_prng.h rc4/rc4_prng.c isaac_rand/isaac_standard.h isaac_rand/isaac_rand.h isaac_rand/isaac_rand.c isaac_rand/isaac64.h isaac_rand/isaac64.c mt19937ar-cok/mt19937ar-cok.c nwipe.h mt19937ar-cok/mt19937ar-cok.h alfg/add_lagg_fibonacci_prng.h alfg/add_lagg_fibonacci_prng.c xor/xoroshiro256_prng.h xor/xoroshiro256_prng.c pass.h device.h logging.c method.c options.c prng.c version.c temperature.c PDFGen/pdfgen.h PDFGen/pdfgen.c create_pdf.c create_pdf.h embedded_images/shred_db.jpg.c embedded_images/shred_db.jpg.h embedded_images/tick_erased.jpg.c embedded_images/tick_erased.jpg.h embedded_images/redcross.c embedded_images/redcross.h hpa_dco.h hpa_dco.c miscellaneous.h miscellaneous.c embedded_images/nwipe_exclamation.jpg.h embedded_images/nwipe_exclamation.jpg.c conf.h conf.c customers.h customers.c hddtemp_scsi/hddtemp.h hddtemp_scsi/scsi.h hddtemp_scsi/scsicmds.h hddtemp_scsi/get_scsi_temp.c hddtemp_scsi/scsi.c hddtemp_scsi/scsicmds.c nwipe_LDADD = $(PARTED_LIBS) $(LIBCONFIG) diff --git a/src/gui.c b/src/gui.c index b8fa735b..86cf086b 100644 --- a/src/gui.c +++ b/src/gui.c @@ -1616,11 +1616,12 @@ void nwipe_gui_prng( void ) extern nwipe_prng_t nwipe_aes_ctr_prng; extern nwipe_prng_t nwipe_xoroshiro256_prng; extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng; + extern nwipe_prng_t nwipe_rc4_prng; extern int terminate_signal; /* The number of implemented PRNGs. */ - const int count = 5; + const int count = 6; /* The first tabstop. */ const int tab1 = 2; @@ -1662,6 +1663,10 @@ void nwipe_gui_prng( void ) { focus = 4; } + if( nwipe_options.prng == &nwipe_rc4_prng ) + { + focus = 5; + } do { /* Clear the main window. */ @@ -1678,6 +1683,7 @@ void nwipe_gui_prng( void ) mvwprintw( main_window, yy++, tab1, " %s", nwipe_isaac64.label ); mvwprintw( main_window, yy++, tab1, " %s", nwipe_add_lagg_fibonacci_prng.label ); mvwprintw( main_window, yy++, tab1, " %s", nwipe_xoroshiro256_prng.label ); + mvwprintw( main_window, yy++, tab1, " %s", nwipe_rc4_prng.label ); yy++; /* Print the cursor. */ @@ -1852,6 +1858,38 @@ void nwipe_gui_prng( void ) tab1, "especially for legacy systems, due to its efficiency and minimal demands. " ); break; + + case 5: + + mvwprintw( main_window, + yy++, + tab1, + "RC4, originally designed by Ron Rivest, is a widely used symmetric stream cipher " ); + mvwprintw( main_window, + yy++, + tab1, + "algorithm that can also function as a pseudo-random number generator (PRNG). " ); + mvwprintw( main_window, + yy++, + tab1, + "Although it was primarily intended for encryption, RC4 has been adapted for various " ); + mvwprintw( main_window, + yy++, + tab1, + "applications that require random number generation. The algorithm features a variable " ); + mvwprintw( main_window, + yy++, + tab1, + "key length and generates numbers in a fast, byte-oriented manner. It is suitable for " ); + mvwprintw( main_window, + yy++, + tab1, + "scenarios requiring simplicity and speed, but newer PRNGs may offer better randomness " ); + mvwprintw( main_window, + yy++, + tab1, + "for cryptographic purposes. " ); + break; } /* switch */ @@ -1922,6 +1960,10 @@ void nwipe_gui_prng( void ) { nwipe_options.prng = &nwipe_xoroshiro256_prng; } + if( focus == 5 ) + { + nwipe_options.prng = &nwipe_rc4_prng; + } return; case KEY_BACKSPACE: diff --git a/src/options.c b/src/options.c index c855d0e2..7a1a20e9 100644 --- a/src/options.c +++ b/src/options.c @@ -44,6 +44,7 @@ int nwipe_options_parse( int argc, char** argv ) extern nwipe_prng_t nwipe_isaac64; extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng; extern nwipe_prng_t nwipe_xoroshiro256_prng; + extern nwipe_prng_t nwipe_rc4_prng; /* The getopt() result holder. */ int nwipe_opt; @@ -503,6 +504,11 @@ int nwipe_options_parse( int argc, char** argv ) nwipe_options.prng = &nwipe_xoroshiro256_prng; break; } + if( strcmp( optarg, "rc4_prng" ) == 0 ) + { + nwipe_options.prng = &nwipe_rc4_prng; + break; + } /* Else we do not know this PRNG. */ fprintf( stderr, "Error: Unknown prng '%s'.\n", optarg ); @@ -554,6 +560,7 @@ void nwipe_options_log( void ) extern nwipe_prng_t nwipe_isaac64; extern nwipe_prng_t nwipe_add_lagg_fibonacci_prng; extern nwipe_prng_t nwipe_xoroshiro256_prng; + extern nwipe_prng_t nwipe_rc4_prng; /** * Prints a manifest of options to the log. @@ -623,6 +630,11 @@ void nwipe_options_log( void ) { nwipe_log( NWIPE_LOG_NOTICE, " prng = Isaac" ); } + if( nwipe_options.prng == &nwipe_rc4_prng ) + { + nwipe_log( NWIPE_LOG_NOTICE, " prng = RC4" ); + } + else { if( nwipe_options.prng == &nwipe_isaac64 ) @@ -714,7 +726,7 @@ void display_help() puts( " -l, --logfile=FILE Filename to log to. Default is STDOUT\n" ); puts( " -P, --PDFreportpath=PATH Path to write PDF reports to. Default is \".\"" ); puts( " If set to \"noPDF\" no PDF reports are written.\n" ); - puts( " -p, --prng=METHOD PRNG option (mersenne|twister|isaac|isaac64|add_lagg_fibonacci_prng)\n" ); + puts( " -p, --prng=METHOD PRNG option (mersenne|twister|isaac|isaac64|add_lagg_fibonacci_prng|rc4_prng)\n" ); puts( " -q, --quiet Anonymize logs and the GUI by removing unique data, i.e." ); puts( " serial numbers, LU WWN Device ID, and SMBIOS/DMI data" ); puts( " XXXXXX = S/N exists, ????? = S/N not obtainable\n" ); diff --git a/src/prng.c b/src/prng.c index abf1b6cc..d90c4971 100644 --- a/src/prng.c +++ b/src/prng.c @@ -21,12 +21,14 @@ #include "prng.h" #include "context.h" #include "logging.h" +#include #include "mt19937ar-cok/mt19937ar-cok.h" #include "isaac_rand/isaac_rand.h" #include "isaac_rand/isaac64.h" #include "alfg/add_lagg_fibonacci_prng.h" //Lagged Fibonacci generator prototype #include "xor/xoroshiro256_prng.h" //XORoshiro-256 prototype +#include "rc4/rc4_prng.h" //RC4 protoype nwipe_prng_t nwipe_twister = { "Mersenne Twister (mt19937ar-cok)", nwipe_twister_init, nwipe_twister_read }; @@ -40,6 +42,82 @@ nwipe_prng_t nwipe_add_lagg_fibonacci_prng = { "Lagged Fibonacci generator", /* XOROSHIRO-256 PRNG Structure */ nwipe_prng_t nwipe_xoroshiro256_prng = { "XORoshiro-256", nwipe_xoroshiro256_prng_init, nwipe_xoroshiro256_prng_read }; +/* RC4 PRNG Structure */ +nwipe_prng_t nwipe_rc4_prng = { "RC4", nwipe_rc4_prng_init, nwipe_rc4_prng_read }; + +// Function to check CPUID and test if SSE4.1 is supported +#if defined( _MSC_VER ) +#include // For MSVC compilers to use __cpuid +#else +#include // For GCC/Clang compilers to use __cpuid +#endif + +/** + * Function to check if the CPU supports SSE 4.1. + * + * @return 1 if SSE 4.1 is supported, 0 otherwise. + */ +int cpu_supports_sse41() +{ + unsigned int eax, ebx, ecx, edx; + +#if defined( _MSC_VER ) + int cpuInfo[4]; + // Call __cpuid with eax = 1 to get feature information + __cpuid( cpuInfo, 1 ); + ecx = cpuInfo[2]; // ECX register contains feature bits +#else + // Use GCC/Clang __cpuid function + __cpuid( 1, eax, ebx, ecx, edx ); +#endif + + // SSE 4.1 is indicated by bit 19 in the ECX register. + // If this bit is set, the CPU supports SSE 4.1. + return ( ecx & ( 1 << 19 ) ) != 0; +} + +/** + * Function to check if the CPU supports AVX2. + * + * @return 1 if AVX2 is supported, 0 otherwise. + */ +int cpu_supports_avx2() +{ + unsigned int eax, ebx, ecx, edx; + +#if defined( _MSC_VER ) + int cpuInfo[4]; + // Call __cpuid with eax = 1 to get basic feature information + __cpuid( cpuInfo, 1 ); + ecx = cpuInfo[2]; // ECX contains basic feature bits + + // Check if the CPU supports AVX (bit 28 of ECX in leaf 1) + if( !( ecx & ( 1 << 28 ) ) ) + { + return 0; // AVX is not supported, so AVX2 can't be supported + } + + // Call __cpuid with eax = 7 and ecx = 0 to get extended features + __cpuid( cpuInfo, 7 ); + ebx = cpuInfo[1]; // EBX contains extended feature bits +#else + // Use GCC/Clang __cpuid for basic feature information (leaf 1) + __cpuid( 1, eax, ebx, ecx, edx ); + + // Check if the CPU supports AVX (bit 28 of ECX in leaf 1) + if( !( ecx & ( 1 << 28 ) ) ) + { + return 0; // AVX is not supported, so AVX2 can't be supported + } + + // Use GCC/Clang __cpuid for extended feature information (leaf 7) + __cpuid_count( 7, 0, eax, ebx, ecx, edx ); +#endif + + // AVX2 is indicated by bit 5 in the EBX register of CPUID leaf 7. + return ( ebx & ( 1 << 5 ) ) != 0; +} + /* Print given number of bytes from unsigned integer number to a byte stream buffer starting with low-endian. */ static inline void u32_to_buffer( u8* restrict buffer, u32 val, const int len ) { @@ -340,3 +418,78 @@ int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE ) return 0; // Success } + +int nwipe_rc4_prng_init( NWIPE_PRNG_INIT_SIGNATURE ) +{ + nwipe_log( NWIPE_LOG_NOTICE, "Initialising RC4 PRNG" ); + + if( *state == NULL ) + { + /* This is the first time that we have been called. */ + *state = malloc( sizeof( rc4_state_t ) ); + } + rc4_init( (rc4_state_t*) *state, (uint64_t*) ( seed->s ), seed->length / sizeof( uint64_t ) ); + + return 0; +} + +int nwipe_rc4_prng_read( NWIPE_PRNG_READ_SIGNATURE ) +{ + u8* restrict bufpos = buffer; + size_t words = count / SIZE_OF_RC4_PRNG; // SIZE_OF_RC4_PRNG is 4096 bytes + + // Check if the CPU supports AVX2 first + int use_avx2 = cpu_supports_avx2(); + + // Check if the CPU supports SSE4.1 + int use_sse4 = cpu_supports_sse41(); + + /* Loop to fill the buffer with blocks directly from the RC4 algorithm */ + for( size_t ii = 0; ii < words; ++ii ) + { + if( use_avx2 ) + { + // Use AVX2-optimized version + rc4_avx2_genrand( (rc4_state_t*) *state, bufpos ); + } + else if( use_sse4 ) + { + // Use SSE4-optimized version + rc4_sse4_genrand( (rc4_state_t*) *state, bufpos ); + } + else + { + // Fallback to generic version + rc4_genrand_4096_to_buf( (rc4_state_t*) *state, bufpos ); + } + bufpos += SIZE_OF_RC4_PRNG; // Move to the next block + } + + /* Handle remaining bytes if count is not a multiple of SIZE_OF_RC4_PRNG */ + const size_t remain = count % SIZE_OF_RC4_PRNG; + if( remain > 0 ) + { + unsigned char temp_output[SIZE_OF_RC4_PRNG]; // Temporary buffer for the last block + + if( use_avx2 ) + { + // Use AVX2-optimized version + rc4_avx2_genrand( (rc4_state_t*) *state, temp_output ); + } + else if( use_sse4 ) + { + // Use SSE4-optimized version + rc4_sse4_genrand( (rc4_state_t*) *state, temp_output ); + } + else + { + // Fallback to generic version + rc4_genrand_4096_to_buf( (rc4_state_t*) *state, temp_output ); + } + + // Copy the remaining bytes + memcpy( bufpos, temp_output, remain ); + } + + return 0; // Success +} diff --git a/src/prng.h b/src/prng.h index a9add099..461d6321 100644 --- a/src/prng.h +++ b/src/prng.h @@ -63,6 +63,10 @@ int nwipe_add_lagg_fibonacci_prng_read( NWIPE_PRNG_READ_SIGNATURE ); int nwipe_xoroshiro256_prng_init( NWIPE_PRNG_INIT_SIGNATURE ); int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE ); +/* RC4 prototypes. */ +int nwipe_rc4_prng_init( NWIPE_PRNG_INIT_SIGNATURE ); +int nwipe_rc4_prng_read( NWIPE_PRNG_READ_SIGNATURE ); + /* Size of the twister is not derived from the architecture, but it is strictly 4 bytes */ #define SIZE_OF_TWISTER 4 @@ -76,4 +80,7 @@ int nwipe_xoroshiro256_prng_read( NWIPE_PRNG_READ_SIGNATURE ); /* Size of the XOROSHIRO-256 is not derived from the architecture, but it is strictly 32 bytes */ #define SIZE_OF_XOROSHIRO256_PRNG 32 +/* Size of the RC4 is not derived from the architecture, but it is strictly 4096 bytes */ +#define SIZE_OF_RC4_PRNG 4096 + #endif /* PRNG_H_ */ diff --git a/src/rc4/rc4_prng.c b/src/rc4/rc4_prng.c new file mode 100644 index 00000000..e7554660 --- /dev/null +++ b/src/rc4/rc4_prng.c @@ -0,0 +1,151 @@ +/* + * RC4 PRNG Implementation (Optimized with AVX2 for nwipe) + * Original RC4 Algorithm Author: Ron Rivest (1987) + * Adaptation Author: Fabian Druschke + * Date: 2024-09-07 + * + * This version of the RC4 PRNG is optimized using AVX2 instructions for enhanced performance. + * It generates pseudorandom data for the nwipe project and is not intended for cryptographic purposes. + * + * Disclaimer: This software is provided "as is", without warranty of any kind, express or implied. + */ + +#include "rc4_prng.h" +#include +#include +#include // For AVX2 support + +// Initialize the RC4 key +void rc4_init( rc4_state_t* state, uint64_t init_key[], unsigned long key_length ) +{ + int i, j = 0; + unsigned char k[RC4_KEY_LENGTH]; + + // Convert init_key into a byte array (k) and fill the rest if key_length is smaller + for( i = 0; i < RC4_KEY_LENGTH; i++ ) + { + if( i < key_length * sizeof( uint64_t ) ) + { + k[i] = ( (unsigned char*) init_key )[i]; + } + else + { + // Fallback in case of insufficient key length + k[i] = k[i - 1] * 6364136223846793005ULL + 1; + } + } + + // Initialize the S-Box with identity permutation + for( i = 0; i < RC4_KEY_LENGTH; i++ ) + { + state->S[i] = i; + } + + // Permute the S-Box based on the key + for( i = 0; i < RC4_KEY_LENGTH; i++ ) + { + j = ( j + state->S[i] + k[i] ) % RC4_KEY_LENGTH; + unsigned char temp = state->S[i]; + state->S[i] = state->S[j]; + state->S[j] = temp; + } + + state->i = 0; + state->j = 0; +} + +// Generate 4096 random bytes and write them into the buffer bufpos +void rc4_genrand_4096_to_buf( rc4_state_t* state, unsigned char* bufpos ) +{ + unsigned char temp; + unsigned char temp_buffer[OUTPUT_DATA_LENGTH]; // Temporary buffer + + // Loop unrolling and prefetching for performance optimization + unsigned long n; + for( n = 0; n < OUTPUT_DATA_LENGTH; n += 4 ) + { + _mm_prefetch( (const char*) &state->S[state->i + 16], _MM_HINT_T0 ); + + // Generate the next 4 bytes + for( int i = 0; i < 4; i++ ) + { + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] ) % RC4_KEY_LENGTH; + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + temp_buffer[n + i] = state->S[( state->S[state->i] + state->S[state->j] ) % RC4_KEY_LENGTH]; + } + } + + memcpy( bufpos, temp_buffer, OUTPUT_DATA_LENGTH ); +} + +// Generate 4096 random bytes and write them into the buffer bufpos with SSE 4.2 +void rc4_sse4_genrand( rc4_state_t* state, unsigned char* bufpos ) +{ + unsigned char temp; + unsigned char temp_buffer[OUTPUT_DATA_LENGTH]; // Temporary buffer for generated random data + + __m128i sse_temp_buffer; // 128-bit SIMD register for storing 16 bytes of data at a time + unsigned long n; + + // Loop through the output buffer in blocks of 16 bytes for SSE4 processing + for( n = 0; n < OUTPUT_DATA_LENGTH; n += 16 ) + { + // Prefetch the next block of memory to improve cache performance + _mm_prefetch( (const char*) &state->S[state->i + 16], _MM_HINT_T0 ); + + // Process 16 bytes at a time using SSE4 support + for( int i = 0; i < 16; i += 4 ) + { + // Update the 'i' index for the RC4 algorithm, wrapping around at the key length + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + + // Update the 'j' index for the RC4 algorithm, wrapping around at the key length + state->j = ( state->j + state->S[state->i] ) % RC4_KEY_LENGTH; + + // Swap state bytes based on the updated indices + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + + // Generate the next random byte using the RC4 algorithm + temp_buffer[n + i] = state->S[( state->S[state->i] + state->S[state->j] ) % RC4_KEY_LENGTH]; + } + + // Use SSE4 instructions to copy the generated 16 bytes in parallel + sse_temp_buffer = _mm_loadu_si128( (__m128i*) &temp_buffer[n] ); + _mm_storeu_si128( (__m128i*) ( bufpos + n ), sse_temp_buffer ); + } +} + +// AVX2-optimized version for parallel byte generation +void rc4_avx2_genrand( rc4_state_t* state, unsigned char* bufpos ) +{ + unsigned char temp; + unsigned char temp_buffer[OUTPUT_DATA_LENGTH]; // Temporary buffer + + __m256i avx2_temp_buffer; + unsigned long n; + + for( n = 0; n < OUTPUT_DATA_LENGTH; n += 32 ) + { + _mm_prefetch( (const char*) &state->S[state->i + 32], _MM_HINT_T0 ); + + // Process 32 bytes at a time (using AVX2) + for( int i = 0; i < 32; i += 4 ) + { + state->i = ( state->i + 1 ) % RC4_KEY_LENGTH; + state->j = ( state->j + state->S[state->i] ) % RC4_KEY_LENGTH; + temp = state->S[state->i]; + state->S[state->i] = state->S[state->j]; + state->S[state->j] = temp; + temp_buffer[n + i] = state->S[( state->S[state->i] + state->S[state->j] ) % RC4_KEY_LENGTH]; + } + + // AVX2: Load and store the generated bytes in parallel + avx2_temp_buffer = _mm256_loadu_si256( (__m256i*) &temp_buffer[n] ); + _mm256_storeu_si256( (__m256i*) ( bufpos + n ), avx2_temp_buffer ); + } +} diff --git a/src/rc4/rc4_prng.h b/src/rc4/rc4_prng.h new file mode 100644 index 00000000..e15053f6 --- /dev/null +++ b/src/rc4/rc4_prng.h @@ -0,0 +1,50 @@ +/* + * RC4 PRNG Header File + * Author: [Your Name] + * Date: 2024-09-07 + * + * This header file provides function declarations and data structures for the + * RC4-based pseudorandom number generator implementation. The RC4 algorithm + * is not suitable for cryptographic purposes but can be used for non-secure + * pseudorandom data generation. + * + * As the author of this header file, I, [Your Name], hereby release this work into + * the public domain. I dedicate any and all copyright interest in this work to the public + * domain, making it free to use for anyone for any purpose without any conditions, unless + * such conditions are required by law. + * + * This software is provided "as is", without warranty of any kind, express or implied, + * including but not limited to the warranties of merchantability, fitness for a particular + * purpose, and noninfringement. In no event shall the authors be liable for any claim, + * damages, or other liability, whether in an action of contract, tort, or otherwise, arising + * from, out of, or in connection with the software or the use or other dealings in the software. + */ + +#ifndef RC4_PRNG_H +#define RC4_PRNG_H + +#include + +// Constants +#define RC4_KEY_LENGTH 256 // Size of the S-Box +#define OUTPUT_DATA_LENGTH 4096 // Amount of random data to generate (4096 bytes) + +// RC4 key structure to hold the S-Box and indices +typedef struct rc4_state_s +{ + unsigned char S[RC4_KEY_LENGTH]; // S-Box (Permutation table) + unsigned char i, j; // Indices for the key scheduling +} rc4_state_t; + +// Function to initialize the RC4 key with the given key material +// init_key: The initial key used to seed the RC4 PRNG +// key_length: The length of the init_key in 64-bit blocks +void rc4_init( rc4_state_t* state, uint64_t init_key[], unsigned long key_length ); + +// Function to generate 4096 random bytes and write them into the provided buffer +// bufpos: The buffer where the generated random bytes will be written +void rc4_genrand_4096_to_buf( rc4_state_t* state, unsigned char* bufpos ); +void rc4_sse4_genrand( rc4_state_t* state, unsigned char* bufpos ); +void rc4_avx2_genrand( rc4_state_t* state, unsigned char* bufpos ); + +#endif // RC4_PRNG_H From 65468f8e57b0857f61f04bd60828506fed163fcf Mon Sep 17 00:00:00 2001 From: Fabian Druschke Date: Sat, 7 Sep 2024 23:31:38 +0200 Subject: [PATCH 2/2] Commented sections for AVX2 support for RC4 due to issues with configure.ac, during check for AVX2 support in the compiler, until it's fixed. --- configure.ac | 96 +++++++++++++++++++++------------------------- src/prng.c | 72 ++++++++++++++++++---------------- src/rc4/rc4_prng.c | 3 +- src/rc4/rc4_prng.h | 2 + 4 files changed, 86 insertions(+), 87 deletions(-) diff --git a/configure.ac b/configure.ac index b4c5ec44..2720d3e9 100644 --- a/configure.ac +++ b/configure.ac @@ -16,55 +16,55 @@ PKG_PROG_PKG_CONFIG # Checks for libraries. PKG_CHECK_MODULES( - [PANEL], - [panel], - [ - CFLAGS="${CFLAGS} ${PANEL_CFLAGS}" - LIBS="${LIBS} ${PANEL_LIBS}" - ], - [AC_CHECK_LIB([panel], [main], [ - LIBS="-lpanel $LIBS" - AC_CHECK_HEADERS(panel.h,, [ - AC_CHECK_HEADERS(ncurses/panel.h, [ - AC_DEFINE([PANEL_IN_SUBDIR], [ncurses/], [Look for ncurses headers in subdir]) - ], [AC_MSG_ERROR([ncurses panel headers not found])]) - ]) - ], [AC_MSG_ERROR([ncurses panel library not found])])] + [PANEL], + [panel], + [ + CFLAGS="${CFLAGS} ${PANEL_CFLAGS}" + LIBS="${LIBS} ${PANEL_LIBS}" + ], + [AC_CHECK_LIB([panel], [main], [ + LIBS="-lpanel $LIBS" + AC_CHECK_HEADERS(panel.h,, [ + AC_CHECK_HEADERS(ncurses/panel.h, [ + AC_DEFINE([PANEL_IN_SUBDIR], [ncurses/], [Look for ncurses headers in subdir]) + ], [AC_MSG_ERROR([ncurses panel headers not found])]) + ]) + ], [AC_MSG_ERROR([ncurses panel library not found])])] ) PKG_CHECK_MODULES( - [NCURSES], - [ncurses], - [ - CFLAGS="${CFLAGS} ${NCURSES_CFLAGS}" - LIBS="${LIBS} ${NCURSES_LIBS}" - ], - [AC_CHECK_LIB([ncurses], [delscreen], [ - LIBS="-lncurses $LIBS" - AC_CHECK_HEADERS(ncurses.h,, [ - AC_CHECK_HEADERS(ncurses/ncurses.h, [ - AC_DEFINE([NCURSES_IN_SUBDIR], [ncurses/], [Look for ncurses headers in subdir]) - ], [AC_MSG_ERROR([ncurses headers not found])]) - ]) - ], [AC_MSG_ERROR([ncurses development library not found])] - )] + [NCURSES], + [ncurses], + [ + CFLAGS="${CFLAGS} ${NCURSES_CFLAGS}" + LIBS="${LIBS} ${NCURSES_LIBS}" + ], + [AC_CHECK_LIB([ncurses], [delscreen], [ + LIBS="-lncurses $LIBS" + AC_CHECK_HEADERS(ncurses.h,, [ + AC_CHECK_HEADERS(ncurses/ncurses.h, [ + AC_DEFINE([NCURSES_IN_SUBDIR], [ncurses/], [Look for ncurses headers in subdir]) + ], [AC_MSG_ERROR([ncurses headers not found])]) + ]) + ], [AC_MSG_ERROR([ncurses development library not found])] + )] ) PKG_CHECK_MODULES( - [LIBCONFIG], - [libconfig], - [ - CFLAGS="${CFLAGS} ${LIBCONFIG_CFLAGS}" - LIBS="${LIBS} ${LIBCONFIG_LIBS}" - ], - [AC_CHECK_LIB([libconfig], [main], [ - LIBS="-llibconfig $LIBS" - AC_CHECK_HEADERS(libconfig.h,, [ - AC_CHECK_HEADERS(libconfig.h, [ - AC_DEFINE([LIBCONFIG_IN_SUBDIR], [libconfig/], [Look for libconfig headers in subdir]) - ], [AC_MSG_ERROR([libconfig headers not found])]) - ]) - ], [AC_MSG_ERROR([libconfig library not found])])] + [LIBCONFIG], + [libconfig], + [ + CFLAGS="${CFLAGS} ${LIBCONFIG_CFLAGS}" + LIBS="${LIBS} ${LIBCONFIG_LIBS}" + ], + [AC_CHECK_LIB([libconfig], [main], [ + LIBS="-llibconfig $LIBS" + AC_CHECK_HEADERS(libconfig.h,, [ + AC_CHECK_HEADERS(libconfig.h, [ + AC_DEFINE([LIBCONFIG_IN_SUBDIR], [libconfig/], [Look for libconfig headers in subdir]) + ], [AC_MSG_ERROR([libconfig headers not found])]) + ]) + ], [AC_MSG_ERROR([libconfig library not found])])] ) AC_CHECK_LIB([intl], [libintl_dgettext]) # needed to statically link libparted, but not given in its pkgconfig file @@ -83,14 +83,4 @@ AC_CHECK_MEMBERS([struct stat.st_blksize]) AC_FUNC_MALLOC AC_CHECK_FUNCS([fdatasync memset regcomp strdup strerror]) -# Check if AVX2 is supported -AC_MSG_CHECKING([whether the compiler supports AVX2]) -AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include ]], [[__m256i test = _mm256_set1_epi32(0);]])], - [CFLAGS="${CFLAGS} -mavx2" - AC_DEFINE([HAVE_AVX2], [1], [Define if AVX2 is supported]) - AC_MSG_RESULT([yes])], - [AC_MSG_RESULT([no])] -) - AC_OUTPUT - diff --git a/src/prng.c b/src/prng.c index d90c4971..19f90918 100644 --- a/src/prng.c +++ b/src/prng.c @@ -81,43 +81,47 @@ int cpu_supports_sse41() * * @return 1 if AVX2 is supported, 0 otherwise. */ + +/* int cpu_supports_avx2() { - unsigned int eax, ebx, ecx, edx; + unsigned int eax, ebx, ecx, edx; #if defined( _MSC_VER ) - int cpuInfo[4]; - // Call __cpuid with eax = 1 to get basic feature information - __cpuid( cpuInfo, 1 ); - ecx = cpuInfo[2]; // ECX contains basic feature bits - - // Check if the CPU supports AVX (bit 28 of ECX in leaf 1) - if( !( ecx & ( 1 << 28 ) ) ) - { - return 0; // AVX is not supported, so AVX2 can't be supported - } - - // Call __cpuid with eax = 7 and ecx = 0 to get extended features - __cpuid( cpuInfo, 7 ); - ebx = cpuInfo[1]; // EBX contains extended feature bits + int cpuInfo[4]; + // Call __cpuid with eax = 1 to get basic feature information + __cpuid( cpuInfo, 1 ); + ecx = cpuInfo[2]; // ECX contains basic feature bits + + // Check if the CPU supports AVX (bit 28 of ECX in leaf 1) + if( !( ecx & ( 1 << 28 ) ) ) + { + return 0; // AVX is not supported, so AVX2 can't be supported + } + + // Call __cpuid with eax = 7 and ecx = 0 to get extended features + __cpuid( cpuInfo, 7 ); + ebx = cpuInfo[1]; // EBX contains extended feature bits #else - // Use GCC/Clang __cpuid for basic feature information (leaf 1) - __cpuid( 1, eax, ebx, ecx, edx ); + // Use GCC/Clang __cpuid for basic feature information (leaf 1) + __cpuid( 1, eax, ebx, ecx, edx ); - // Check if the CPU supports AVX (bit 28 of ECX in leaf 1) - if( !( ecx & ( 1 << 28 ) ) ) - { - return 0; // AVX is not supported, so AVX2 can't be supported - } + // Check if the CPU supports AVX (bit 28 of ECX in leaf 1) + if( !( ecx & ( 1 << 28 ) ) ) + { + return 0; // AVX is not supported, so AVX2 can't be supported + } - // Use GCC/Clang __cpuid for extended feature information (leaf 7) - __cpuid_count( 7, 0, eax, ebx, ecx, edx ); + // Use GCC/Clang __cpuid for extended feature information (leaf 7) + __cpuid_count( 7, 0, eax, ebx, ecx, edx ); #endif - // AVX2 is indicated by bit 5 in the EBX register of CPUID leaf 7. - return ( ebx & ( 1 << 5 ) ) != 0; + // AVX2 is indicated by bit 5 in the EBX register of CPUID leaf 7. + return ( ebx & ( 1 << 5 ) ) != 0; } +*/ + /* Print given number of bytes from unsigned integer number to a byte stream buffer starting with low-endian. */ static inline void u32_to_buffer( u8* restrict buffer, u32 val, const int len ) { @@ -439,7 +443,9 @@ int nwipe_rc4_prng_read( NWIPE_PRNG_READ_SIGNATURE ) size_t words = count / SIZE_OF_RC4_PRNG; // SIZE_OF_RC4_PRNG is 4096 bytes // Check if the CPU supports AVX2 first - int use_avx2 = cpu_supports_avx2(); + /* + int use_avx2 = cpu_supports_avx2(); + */ // Check if the CPU supports SSE4.1 int use_sse4 = cpu_supports_sse41(); @@ -447,12 +453,12 @@ int nwipe_rc4_prng_read( NWIPE_PRNG_READ_SIGNATURE ) /* Loop to fill the buffer with blocks directly from the RC4 algorithm */ for( size_t ii = 0; ii < words; ++ii ) { - if( use_avx2 ) + /*if( use_avx2 ) { // Use AVX2-optimized version rc4_avx2_genrand( (rc4_state_t*) *state, bufpos ); - } - else if( use_sse4 ) + }*/ + if( use_sse4 ) { // Use SSE4-optimized version rc4_sse4_genrand( (rc4_state_t*) *state, bufpos ); @@ -471,12 +477,12 @@ int nwipe_rc4_prng_read( NWIPE_PRNG_READ_SIGNATURE ) { unsigned char temp_output[SIZE_OF_RC4_PRNG]; // Temporary buffer for the last block - if( use_avx2 ) + /*if( use_avx2 ) { // Use AVX2-optimized version rc4_avx2_genrand( (rc4_state_t*) *state, temp_output ); - } - else if( use_sse4 ) + }*/ + if( use_sse4 ) { // Use SSE4-optimized version rc4_sse4_genrand( (rc4_state_t*) *state, temp_output ); diff --git a/src/rc4/rc4_prng.c b/src/rc4/rc4_prng.c index e7554660..fed6eb38 100644 --- a/src/rc4/rc4_prng.c +++ b/src/rc4/rc4_prng.c @@ -121,6 +121,7 @@ void rc4_sse4_genrand( rc4_state_t* state, unsigned char* bufpos ) } // AVX2-optimized version for parallel byte generation +/* void rc4_avx2_genrand( rc4_state_t* state, unsigned char* bufpos ) { unsigned char temp; @@ -148,4 +149,4 @@ void rc4_avx2_genrand( rc4_state_t* state, unsigned char* bufpos ) avx2_temp_buffer = _mm256_loadu_si256( (__m256i*) &temp_buffer[n] ); _mm256_storeu_si256( (__m256i*) ( bufpos + n ), avx2_temp_buffer ); } -} +}*/ diff --git a/src/rc4/rc4_prng.h b/src/rc4/rc4_prng.h index e15053f6..8dcc9fc6 100644 --- a/src/rc4/rc4_prng.h +++ b/src/rc4/rc4_prng.h @@ -45,6 +45,8 @@ void rc4_init( rc4_state_t* state, uint64_t init_key[], unsigned long key_length // bufpos: The buffer where the generated random bytes will be written void rc4_genrand_4096_to_buf( rc4_state_t* state, unsigned char* bufpos ); void rc4_sse4_genrand( rc4_state_t* state, unsigned char* bufpos ); +/* void rc4_avx2_genrand( rc4_state_t* state, unsigned char* bufpos ); +*/ #endif // RC4_PRNG_H