//! SHA-512 `x86`/`x86_64` backend

#![allow(clippy::many_single_char_names)]

use core::mem::size_of;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

use crate::consts::K64;

cpufeatures::new!(avx2_cpuid, "avx2");

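/// Compresses `blocks` into `state`, dispatching at runtime to the AVX2
/// implementation when the CPU supports it and falling back to the portable
/// software backend otherwise.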
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
    // after stabilization
    if avx2_cpuid::get() {
        unsafe {
            sha512_compress_x86_64_avx2(state, blocks);
        }
    } else {
        super::soft::compress(state, blocks);
    }
}

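// Processes blocks two at a time: two adjacent 128-byte blocks are spread
// across the low and high lanes of 256-bit registers, the first block's rounds
// run immediately, and the second block's precomputed round inputs
// (w[t] + K64[t]) are stashed in `t2` and replayed by `process_second_block`.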
#[target_feature(enable = "avx2")]
unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
    let mut start_block = 0;

    if blocks.len() & 0b1 != 0 {
        sha512_compress_x86_64_avx(state, &blocks[0]);
        start_block += 1;
    }

    let mut ms: MsgSchedule = Default::default();
    let mut t2: RoundStates = [0u64; SHA512_ROUNDS_NUM];
    let mut x = [_mm256_setzero_si256(); 8];

    for i in (start_block..blocks.len()).step_by(2) {
        load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _);

        // First block
        let mut current_state = *state;
        rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2);
        rounds_64_79(&mut current_state, &ms);
        accumulate_state(state, &current_state);

        // Second block
        current_state = *state;
        process_second_block(&mut current_state, &t2);
        accumulate_state(state, &current_state);
    }
}

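// Compresses a single block; called by the AVX2 path above when the block
// count is odd, so its main loop always sees an even number of blocks.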
#[inline(always)]
unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
    let mut ms: MsgSchedule = Default::default();
    let mut x = [_mm_setzero_si128(); 8];

    // Reduced to single iteration
    let mut current_state = *state;
    load_data_avx(&mut x, &mut ms, block.as_ptr() as *const _);
    rounds_0_63_avx(&mut current_state, &mut x, &mut ms);
    rounds_64_79(&mut current_state, &ms);
    accumulate_state(state, &current_state);
}

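// Loads one block's 16 message words, byte-swapping them from big-endian via
// `_mm_shuffle_epi8`, and stores w[t] + K64[t] for rounds 0..15 into `ms`.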
#[inline(always)]
unsafe fn load_data_avx(x: &mut [__m128i; 8], ms: &mut MsgSchedule, data: *const __m128i) {
    #[allow(non_snake_case)]
    let MASK = _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);

    macro_rules! unrolled_iterations {
        ($($i:literal),*) => {$(
            x[$i] = _mm_loadu_si128(data.add($i) as *const _);
            x[$i] = _mm_shuffle_epi8(x[$i], MASK);

            let y = _mm_add_epi64(
                x[$i],
                _mm_loadu_si128(&K64[2 * $i] as *const u64 as *const _),
            );

            // `ms` is only 8-byte aligned, so the store must be unaligned
            _mm_storeu_si128(&mut ms[2 * $i] as *mut u64 as *mut _, y);
        )*};
    }

    unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
}

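// Loads two adjacent blocks into the low/high 128-bit lanes of each 256-bit
// register (one block is 8 x __m128i), byte-swaps them, and stores the
// w[t] + K64[t] values for rounds 0..15: first block into `ms`, second
// block into `t2`.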
#[inline(always)]
unsafe fn load_data_avx2(
    x: &mut [__m256i; 8],
    ms: &mut MsgSchedule,
    t2: &mut RoundStates,
    data: *const __m128i,
) {
    #[allow(non_snake_case)]
    let MASK = _mm256_set_epi64x(
        0x0809_0A0B_0C0D_0E0F_i64,
        0x0001_0203_0405_0607_i64,
        0x0809_0A0B_0C0D_0E0F_i64,
        0x0001_0203_0405_0607_i64,
    );

    macro_rules! unrolled_iterations {
        ($($i:literal),*) => {$(
            // low lane: first block; high lane: second block (8 x __m128i later)
            x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i + 8) as *const _), 1);
            x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i) as *const _), 0);

            x[$i] = _mm256_shuffle_epi8(x[$i], MASK);

            let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _);
            let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t));

            // `ms` and `t2` are only 8-byte aligned, so the stores must be unaligned
            _mm_storeu_si128(
                &mut ms[2 * $i] as *mut u64 as *mut _,
                _mm256_extracti128_si256(y, 0),
            );
            _mm_storeu_si128(
                &mut t2[2 * $i] as *mut u64 as *mut _,
                _mm256_extracti128_si256(y, 1),
            );
        )*};
    }

    unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
}

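// Rounds 0..63 for a single block: four passes of 16 rounds each, where every
// iteration consumes two w[t] + K64[t] values from `ms` and overwrites them
// with the next chunk produced by the schedule update.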
#[inline(always)]
unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &mut MsgSchedule) {
    let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM;

    for _ in 0..4 {
        for j in 0..8 {
            let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _);
            let y = sha512_update_x_avx(x, k64);

            sha_round(current_state, ms[2 * j]);
            sha_round(current_state, ms[2 * j + 1]);

            _mm_storeu_si128(&mut ms[2 * j] as *mut u64 as *mut _, y);
            k64_idx += 2;
        }
    }
}

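// Rounds 0..63 of the first block, expanding both blocks' schedules in
// parallel: lane 0 refills `ms` for the first block, while lane 1 lands in
// `t2[16..80]` as the second block's future round inputs.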
#[inline(always)]
unsafe fn rounds_0_63_avx2(
    current_state: &mut State,
    x: &mut [__m256i; 8],
    ms: &mut MsgSchedule,
    t2: &mut RoundStates,
) {
    let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM;

    for i in 1..5 {
        for j in 0..8 {
            let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _);
            let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t));

            sha_round(current_state, ms[2 * j]);
            sha_round(current_state, ms[2 * j + 1]);

            _mm_storeu_si128(
                &mut ms[2 * j] as *mut u64 as *mut _,
                _mm256_extracti128_si256(y, 0),
            );
            _mm_storeu_si128(
                &mut t2[(16 * i) + 2 * j] as *mut u64 as *mut _,
                _mm256_extracti128_si256(y, 1),
            );

            k64x4_idx += 2;
        }
    }
}

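// Rounds 64..79: no further schedule expansion is needed, so this consumes
// the last 16 precomputed w[t] + K64[t] values left in `ms`.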
#[inline(always)]
unsafe fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) {
    for i in 64..80 {
        sha_round(current_state, ms[i & 0xf]);
    }
}

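// Replays all 80 rounds of the second block from the w[t] + K64[t] values
// accumulated in `t2` by `load_data_avx2` and `rounds_0_63_avx2`.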
#[inline(always)]
unsafe fn process_second_block(current_state: &mut State, t2: &RoundStates) {
    for &wk in t2.iter() {
        sha_round(current_state, wk);
    }
}

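// One SHA-512 round. `x` is the precomputed w[t] + K64[t]; instead of
// reassigning the eight working variables a..h, the state array is rotated
// one slot after each round.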
#[inline(always)]
unsafe fn sha_round(s: &mut State, x: u64) {
    macro_rules! big_sigma0 {
        ($a:expr) => {
            $a.rotate_right(28) ^ $a.rotate_right(34) ^ $a.rotate_right(39)
        };
    }
    macro_rules! big_sigma1 {
        ($a:expr) => {
            $a.rotate_right(14) ^ $a.rotate_right(18) ^ $a.rotate_right(41)
        };
    }
    macro_rules! bool3ary_202 {
        ($a:expr, $b:expr, $c:expr) => {
            $c ^ ($a & ($b ^ $c))
        };
    } // Choose, MD5F, SHA1C
    macro_rules! bool3ary_232 {
        ($a:expr, $b:expr, $c:expr) => {
            ($a & $b) ^ ($a & $c) ^ ($b & $c)
        };
    } // Majority, SHA1M

    macro_rules! rotate_state {
        ($s:ident) => {{
            let tmp = $s[7];
            $s[7] = $s[6];
            $s[6] = $s[5];
            $s[5] = $s[4];
            $s[4] = $s[3];
            $s[3] = $s[2];
            $s[2] = $s[1];
            $s[1] = $s[0];
            $s[0] = tmp;
        }};
    }

    let t = x
        .wrapping_add(s[7])
        .wrapping_add(big_sigma1!(s[4]))
        .wrapping_add(bool3ary_202!(s[4], s[5], s[6]));

    s[7] = t
        .wrapping_add(big_sigma0!(s[0]))
        .wrapping_add(bool3ary_232!(s[0], s[1], s[2]));
    s[3] = s[3].wrapping_add(t);

    rotate_state!(s);
}

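// Davies-Meyer feed-forward: adds the compressed state back into `dst`.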
#[inline(always)]
unsafe fn accumulate_state(dst: &mut State, src: &State) {
    for i in 0..SHA512_HASH_WORDS_NUM {
        dst[i] = dst[i].wrapping_add(src[i]);
    }
}

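// Generates the two-words-at-a-time message schedule update for a given
// vector width. sigma0 (s0 = rotr 1, rotr 8, shr 7) and sigma1 (s1 = rotr 19,
// rotr 61, shr 6) are built from paired shifts and XORs; `x` is rotated so
// x[7] always holds the two newest words, and the return value is the new
// word pair plus its round constants.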
macro_rules! fn_sha512_update_x {
    ($name:ident, $ty:ident, {
        ADD64 = $ADD64:ident,
        ALIGNR8 = $ALIGNR8:ident,
        SRL64 = $SRL64:ident,
        SLL64 = $SLL64:ident,
        XOR = $XOR:ident,
    }) => {
        unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty {
            // q[2:1]
            let mut t0 = $ALIGNR8(x[1], x[0], 8);
            // q[10:9]
            let mut t3 = $ALIGNR8(x[5], x[4], 8);
            // q[2:1] >> s0[0]
            let mut t2 = $SRL64(t0, 1);
            // q[1:0] + q[10:9]
            x[0] = $ADD64(x[0], t3);
            // q[2:1] >> s0[2]
            t3 = $SRL64(t0, 7);
            // q[2:1] << (64 - s0[1])
            let mut t1 = $SLL64(t0, 64 - 8);
            // (q[2:1] >> s0[2]) ^
            // (q[2:1] >> s0[0])
            t0 = $XOR(t3, t2);
            // q[2:1] >> s0[1]
            t2 = $SRL64(t2, 8 - 1);
            // (q[2:1] >> s0[2]) ^
            // (q[2:1] >> s0[0]) ^
            // q[2:1] << (64 - s0[1])
            t0 = $XOR(t0, t1);
            // q[2:1] << (64 - s0[0])
            t1 = $SLL64(t1, 8 - 1);
            // sigma0(q[2:1])
            t0 = $XOR(t0, t2);
            t0 = $XOR(t0, t1);
            // q[15:14] >> s1[2]
            t3 = $SRL64(x[7], 6);
            // q[15:14] << (64 - s1[1])
            t2 = $SLL64(x[7], 64 - 61);
            // q[1:0] + sigma0(q[2:1])
            x[0] = $ADD64(x[0], t0);
            // q[15:14] >> s1[0]
            t1 = $SRL64(x[7], 19);
            // (q[15:14] >> s1[2]) ^
            // (q[15:14] << (64 - s1[1]))
            t3 = $XOR(t3, t2);
            // q[15:14] << (64 - s1[0])
            t2 = $SLL64(t2, 61 - 19);
            // (q[15:14] >> s1[2]) ^
            // (q[15:14] << (64 - s1[1])) ^
            // (q[15:14] >> s1[0])
            t3 = $XOR(t3, t1);
            // q[15:14] >> s1[1]
            t1 = $SRL64(t1, 61 - 19);
            // sigma1(q[15:14])
            t3 = $XOR(t3, t2);
            t3 = $XOR(t3, t1);

            // q[1:0] + q[10:9] + sigma1(q[15:14]) + sigma0(q[2:1])
            x[0] = $ADD64(x[0], t3);

            // rotate
            let temp = x[0];
            x[0] = x[1];
            x[1] = x[2];
            x[2] = x[3];
            x[3] = x[4];
            x[4] = x[5];
            x[5] = x[6];
            x[6] = x[7];
            x[7] = temp;

            $ADD64(x[7], k64)
        }
    };
}

fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
    ADD64 = _mm_add_epi64,
    ALIGNR8 = _mm_alignr_epi8,
    SRL64 = _mm_srli_epi64,
    SLL64 = _mm_slli_epi64,
    XOR = _mm_xor_si128,
});

fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
    ADD64 = _mm256_add_epi64,
    ALIGNR8 = _mm256_alignr_epi8,
    SRL64 = _mm256_srli_epi64,
    SLL64 = _mm256_slli_epi64,
    XOR = _mm256_xor_si256,
});

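// SHA-512 operates on 128-byte blocks (16 u64 words) over 80 rounds and
// produces a 64-byte digest (8 u64 words).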
type State = [u64; SHA512_HASH_WORDS_NUM];
type MsgSchedule = [u64; SHA512_BLOCK_WORDS_NUM];
type RoundStates = [u64; SHA512_ROUNDS_NUM];

const SHA512_BLOCK_BYTE_LEN: usize = 128;
const SHA512_ROUNDS_NUM: usize = 80;
const SHA512_HASH_BYTE_LEN: usize = 64;
const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>();
const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>();