Skip to content

Commit efda00d

Browse files
authored
Add x86 intrinsics support for sha2-512 (#312)
1 parent 4a2845c commit efda00d

File tree

2 files changed

+362
-3
lines changed

2 files changed

+362
-3
lines changed

sha2/src/sha512.rs

+10-3
Original file line numberDiff line numberDiff line change
@@ -231,10 +231,17 @@ cfg_if::cfg_if! {
231231
if #[cfg(feature = "force-soft")] {
232232
mod soft;
233233
use soft::compress;
234-
} else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
235-
fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
236-
sha2_asm::compress512(state, blocks);
234+
} else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
235+
#[cfg(not(feature = "asm"))]
236+
mod soft;
237+
#[cfg(feature = "asm")]
238+
mod soft {
239+
pub(crate) fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
240+
sha2_asm::compress512(state, blocks);
241+
}
237242
}
243+
mod x86;
244+
use x86::compress;
238245
} else {
239246
mod soft;
240247
use soft::compress;

sha2/src/sha512/x86.rs

+352
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
//! SHA-512 `x86`/`x86_64` backend
2+
3+
#![allow(clippy::many_single_char_names)]
4+
5+
use core::mem::size_of;
6+
7+
#[cfg(target_arch = "x86")]
8+
use core::arch::x86::*;
9+
#[cfg(target_arch = "x86_64")]
10+
use core::arch::x86_64::*;
11+
12+
use crate::consts::K64;
13+
14+
cpufeatures::new!(avx2_cpuid, "avx2");
15+
16+
pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
17+
// TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
18+
// after stabilization
19+
if avx2_cpuid::get() {
20+
unsafe {
21+
sha512_compress_x86_64_avx2(state, blocks);
22+
}
23+
} else {
24+
super::soft::compress(state, blocks);
25+
}
26+
}
27+
28+
#[target_feature(enable = "avx2")]
29+
unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
30+
let mut start_block = 0;
31+
32+
if blocks.len() & 0b1 != 0 {
33+
sha512_compress_x86_64_avx(state, &blocks[0]);
34+
start_block += 1;
35+
}
36+
37+
let mut ms: MsgSchedule = Default::default();
38+
let mut t2: RoundStates = [0u64; SHA512_ROUNDS_NUM];
39+
let mut x = [_mm256_setzero_si256(); 8];
40+
41+
for i in (start_block..blocks.len()).step_by(2) {
42+
load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _);
43+
44+
// First block
45+
let mut current_state = *state;
46+
rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2);
47+
rounds_64_79(&mut current_state, &ms);
48+
accumulate_state(state, &current_state);
49+
50+
// Second block
51+
current_state = *state;
52+
process_second_block(&mut current_state, &t2);
53+
accumulate_state(state, &current_state);
54+
}
55+
}
56+
57+
#[inline(always)]
58+
unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
59+
let mut ms = Default::default();
60+
let mut x = [_mm_setzero_si128(); 8];
61+
62+
// Reduced to single iteration
63+
let mut current_state = *state;
64+
load_data_avx(&mut x, &mut ms, block.as_ptr() as *const _);
65+
rounds_0_63_avx(&mut current_state, &mut x, &mut ms);
66+
rounds_64_79(&mut current_state, &ms);
67+
accumulate_state(state, &current_state);
68+
}
69+
70+
#[inline(always)]
71+
unsafe fn load_data_avx(x: &mut [__m128i; 8], ms: &mut MsgSchedule, data: *const __m128i) {
72+
#[allow(non_snake_case)]
73+
let MASK = _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
74+
75+
macro_rules! unrolled_iterations {
76+
($($i:literal),*) => {$(
77+
x[$i] = _mm_loadu_si128(data.add($i) as *const _);
78+
x[$i] = _mm_shuffle_epi8(x[$i], MASK);
79+
80+
let y = _mm_add_epi64(
81+
x[$i],
82+
_mm_loadu_si128(&K64[2 * $i] as *const u64 as *const _),
83+
);
84+
85+
_mm_store_si128(&mut ms[2 * $i] as *mut u64 as *mut _, y);
86+
)*};
87+
}
88+
89+
unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
90+
}
91+
92+
#[inline(always)]
93+
unsafe fn load_data_avx2(
94+
x: &mut [__m256i; 8],
95+
ms: &mut MsgSchedule,
96+
t2: &mut RoundStates,
97+
data: *const __m128i,
98+
) {
99+
#[allow(non_snake_case)]
100+
let MASK = _mm256_set_epi64x(
101+
0x0809_0A0B_0C0D_0E0F_i64,
102+
0x0001_0203_0405_0607_i64,
103+
0x0809_0A0B_0C0D_0E0F_i64,
104+
0x0001_0203_0405_0607_i64,
105+
);
106+
107+
macro_rules! unrolled_iterations {
108+
($($i:literal),*) => {$(
109+
x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i) as *const _), 1);
110+
x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i + 1) as *const _), 0);
111+
112+
x[$i] = _mm256_shuffle_epi8(x[$i], MASK);
113+
114+
let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _);
115+
let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t));
116+
117+
_mm_store_si128(
118+
&mut ms[2 * $i] as *mut u64 as *mut _,
119+
_mm256_extracti128_si256(y, 0),
120+
);
121+
_mm_store_si128(
122+
&mut t2[2 * $i] as *mut u64 as *mut _,
123+
_mm256_extracti128_si256(y, 1),
124+
);
125+
)*};
126+
}
127+
128+
unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
129+
}
130+
131+
#[inline(always)]
132+
unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &mut MsgSchedule) {
133+
let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM;
134+
135+
for _ in 0..4 {
136+
for j in 0..8 {
137+
let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _);
138+
let y = sha512_update_x_avx(x, k64);
139+
140+
sha_round(current_state, ms[2 * j]);
141+
sha_round(current_state, ms[2 * j + 1]);
142+
143+
_mm_store_si128(&mut ms[2 * j] as *const u64 as *mut _, y);
144+
k64_idx += 2;
145+
}
146+
}
147+
}
148+
149+
#[inline(always)]
150+
unsafe fn rounds_0_63_avx2(
151+
current_state: &mut State,
152+
x: &mut [__m256i; 8],
153+
ms: &mut MsgSchedule,
154+
t2: &mut RoundStates,
155+
) {
156+
let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM;
157+
158+
for i in 1..5 {
159+
for j in 0..8 {
160+
let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _);
161+
let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t));
162+
163+
sha_round(current_state, ms[2 * j]);
164+
sha_round(current_state, ms[2 * j + 1]);
165+
166+
_mm_store_si128(
167+
&mut ms[2 * j] as *mut u64 as *mut _,
168+
_mm256_extracti128_si256(y, 0),
169+
);
170+
_mm_store_si128(
171+
&mut t2[(16 * i) + 2 * j] as *mut u64 as *mut _,
172+
_mm256_extracti128_si256(y, 1),
173+
);
174+
175+
k64x4_idx += 2;
176+
}
177+
}
178+
}
179+
180+
#[inline(always)]
181+
unsafe fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) {
182+
for i in 64..80 {
183+
sha_round(current_state, ms[i & 0xf]);
184+
}
185+
}
186+
187+
#[inline(always)]
188+
unsafe fn process_second_block(current_state: &mut State, t2: &RoundStates) {
189+
for t2 in t2.iter() {
190+
sha_round(current_state, *t2);
191+
}
192+
}
193+
194+
#[inline(always)]
195+
unsafe fn sha_round(s: &mut State, x: u64) {
196+
macro_rules! big_sigma0 {
197+
($a:expr) => {
198+
$a.rotate_right(28) ^ $a.rotate_right(34) ^ $a.rotate_right(39)
199+
};
200+
}
201+
macro_rules! big_sigma1 {
202+
($a:expr) => {
203+
$a.rotate_right(14) ^ $a.rotate_right(18) ^ $a.rotate_right(41)
204+
};
205+
}
206+
macro_rules! bool3ary_202 {
207+
($a:expr, $b:expr, $c:expr) => {
208+
$c ^ ($a & ($b ^ $c))
209+
};
210+
} // Choose, MD5F, SHA1C
211+
macro_rules! bool3ary_232 {
212+
($a:expr, $b:expr, $c:expr) => {
213+
($a & $b) ^ ($a & $c) ^ ($b & $c)
214+
};
215+
} // Majority, SHA1M
216+
217+
macro_rules! rotate_state {
218+
($s:ident) => {{
219+
let tmp = $s[7];
220+
$s[7] = $s[6];
221+
$s[6] = $s[5];
222+
$s[5] = $s[4];
223+
$s[4] = $s[3];
224+
$s[3] = $s[2];
225+
$s[2] = $s[1];
226+
$s[1] = $s[0];
227+
$s[0] = tmp;
228+
}};
229+
}
230+
231+
let t = x
232+
.wrapping_add(s[7])
233+
.wrapping_add(big_sigma1!(s[4]))
234+
.wrapping_add(bool3ary_202!(s[4], s[5], s[6]));
235+
236+
s[7] = t
237+
.wrapping_add(big_sigma0!(s[0]))
238+
.wrapping_add(bool3ary_232!(s[0], s[1], s[2]));
239+
s[3] = s[3].wrapping_add(t);
240+
241+
rotate_state!(s);
242+
}
243+
244+
#[inline(always)]
245+
unsafe fn accumulate_state(dst: &mut State, src: &State) {
246+
for i in 0..SHA512_HASH_WORDS_NUM {
247+
dst[i] = dst[i].wrapping_add(src[i]);
248+
}
249+
}
250+
251+
// Generates a message-schedule update function for one vector width.
//
// `x` holds the sixteen most recent schedule words, two per 128-bit lane:
// `x[0]` is `q[1:0]`, `x[7]` is `q[15:14]`. The generated function computes
// the next pair of words, rotates `x` so that the new pair becomes `x[7]`,
// and returns the new pair with `k64` added.
macro_rules! fn_sha512_update_x {
    ($name:ident, $ty:ident, {
        ADD64 = $ADD64:ident,
        ALIGNR8 = $ALIGNR8:ident,
        SRL64 = $SRL64:ident,
        SLL64 = $SLL64:ident,
        XOR = $XOR:ident,
    }) => {
        unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty {
            // q[2:1]: input of the small sigma 0 function
            let w_sigma0 = $ALIGNR8(x[1], x[0], 8);
            // q[10:9]
            let w_mid = $ALIGNR8(x[5], x[4], 8);
            // q[15:14]: input of the small sigma 1 function
            let w_sigma1 = x[7];

            // sigma0(w) = ror(w, 1) ^ ror(w, 8) ^ (w >> 7)
            let ror1 = $XOR($SRL64(w_sigma0, 1), $SLL64(w_sigma0, 64 - 1));
            let ror8 = $XOR($SRL64(w_sigma0, 8), $SLL64(w_sigma0, 64 - 8));
            let sigma0 = $XOR($XOR(ror1, ror8), $SRL64(w_sigma0, 7));

            // sigma1(w) = ror(w, 19) ^ ror(w, 61) ^ (w >> 6)
            let ror19 = $XOR($SRL64(w_sigma1, 19), $SLL64(w_sigma1, 64 - 19));
            let ror61 = $XOR($SRL64(w_sigma1, 61), $SLL64(w_sigma1, 64 - 61));
            let sigma1 = $XOR($XOR(ror19, ror61), $SRL64(w_sigma1, 6));

            // q[1:0] + q[10:9] + sigma0(q[2:1]) + sigma1(q[15:14])
            let next = $ADD64($ADD64($ADD64(x[0], w_mid), sigma0), sigma1);

            // rotate: drop the oldest pair, append the new one
            *x = [x[1], x[2], x[3], x[4], x[5], x[6], x[7], next];

            $ADD64(next, k64)
        }
    };
}

// 128-bit variant: one block at a time.
fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
    ADD64 = _mm_add_epi64,
    ALIGNR8 = _mm_alignr_epi8,
    SRL64 = _mm_srli_epi64,
    SLL64 = _mm_slli_epi64,
    XOR = _mm_xor_si128,
});

// 256-bit variant: every 128-bit lane carries its own schedule.
fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
    ADD64 = _mm256_add_epi64,
    ALIGNR8 = _mm256_alignr_epi8,
    SRL64 = _mm256_srli_epi64,
    SLL64 = _mm256_slli_epi64,
    XOR = _mm256_xor_si256,
});
343+
344+
/// Size of one SHA-512 input block in bytes.
const SHA512_BLOCK_BYTE_LEN: usize = 128;
/// Size of the SHA-512 state in bytes.
const SHA512_HASH_BYTE_LEN: usize = 64;
/// Number of rounds applied per block; also the length of `RoundStates`.
const SHA512_ROUNDS_NUM: usize = 80;
/// Number of 64-bit words in one input block.
const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>();
/// Number of 64-bit words in the state.
const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>();

/// The eight 64-bit working variables.
type State = [u64; SHA512_HASH_WORDS_NUM];
/// Sixteen pending round inputs of the block currently being processed.
type MsgSchedule = [u64; SHA512_BLOCK_WORDS_NUM];
/// One precomputed round input per round, used for the second block of a pair.
type RoundStates = [u64; SHA512_ROUNDS_NUM];

0 commit comments

Comments
 (0)