From ba0810dfc43ce3a11d22933bb848f9308114ad9f Mon Sep 17 00:00:00 2001 From: Ivan Kalinin Date: Mon, 6 Sep 2021 03:54:55 +0300 Subject: [PATCH 1/6] sha2: Add intrinsic version of sha512 for x86 --- sha2/src/consts.rs | 50 ++++++ sha2/src/sha512.rs | 13 +- sha2/src/sha512/x86.rs | 394 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 454 insertions(+), 3 deletions(-) create mode 100644 sha2/src/sha512/x86.rs diff --git a/sha2/src/consts.rs b/sha2/src/consts.rs index f126dc616..4aaff0f41 100644 --- a/sha2/src/consts.rs +++ b/sha2/src/consts.rs @@ -163,6 +163,56 @@ pub const K64X2: [[u64; 2]; 40] = [ [K64[79], K64[78]], ]; +macro_rules! dup_array { + ([$([$a:expr, $b:expr]),*,]) => {[ + $($b, $a, $b, $a),*, + ]} +} + +/// Constants necessary for SHA-512 family of digests. +pub const K64X4: [u64; 160] = dup_array!([ + [K64[1], K64[0]], + [K64[3], K64[2]], + [K64[5], K64[4]], + [K64[7], K64[6]], + [K64[9], K64[8]], + [K64[11], K64[10]], + [K64[13], K64[12]], + [K64[15], K64[14]], + [K64[17], K64[16]], + [K64[19], K64[18]], + [K64[21], K64[20]], + [K64[23], K64[22]], + [K64[25], K64[24]], + [K64[27], K64[26]], + [K64[29], K64[28]], + [K64[31], K64[30]], + [K64[33], K64[32]], + [K64[35], K64[34]], + [K64[37], K64[36]], + [K64[39], K64[38]], + [K64[41], K64[40]], + [K64[43], K64[42]], + [K64[45], K64[44]], + [K64[47], K64[46]], + [K64[49], K64[48]], + [K64[51], K64[50]], + [K64[53], K64[52]], + [K64[55], K64[54]], + [K64[57], K64[56]], + [K64[59], K64[58]], + [K64[61], K64[60]], + [K64[63], K64[62]], + [K64[65], K64[64]], + [K64[67], K64[66]], + [K64[69], K64[68]], + [K64[71], K64[70]], + [K64[73], K64[72]], + [K64[75], K64[74]], + [K64[77], K64[76]], + [K64[79], K64[78]], +]); + pub static H224: [u32; STATE_LEN] = [ 0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4, ]; diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs index 02439fd63..078d4b786 100644 --- a/sha2/src/sha512.rs +++ b/sha2/src/sha512.rs @@ -231,10 +231,17 @@ cfg_if::cfg_if! { if #[cfg(feature = "force-soft")] { mod soft; use soft::compress; - } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] { - fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { - sha2_asm::compress512(state, blocks); + } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + #[cfg(not(feature = "asm"))] + mod soft; + #[cfg(feature = "asm")] + mod soft { + fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { + sha2_asm::compress512(state, blocks); + } } + mod x86; + use x86::compress; } else { mod soft; use soft::compress; diff --git a/sha2/src/sha512/x86.rs b/sha2/src/sha512/x86.rs new file mode 100644 index 000000000..69bdaa55d --- /dev/null +++ b/sha2/src/sha512/x86.rs @@ -0,0 +1,394 @@ +//! 
SHA-512 `x86`/`x86_64` backend
+
+#![allow(clippy::many_single_char_names)]
+
+use core::mem::size_of;
+
+#[cfg(target_arch = "x86")]
+use core::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::*;
+
+use crate::consts::{K64, K64X4};
+
+const SHA512_BLOCK_BYTE_LEN: usize = 128;
+const SHA512_ROUNDS_NUM: usize = 80;
+const SHA512_HASH_BYTE_LEN: usize = 64;
+const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>();
+const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>();
+
+const MS_VEC_NUM_AVX: usize = SHA512_BLOCK_BYTE_LEN / size_of::<__m128i>();
+const MS_VEC_NUM_AVX2: usize = (2 * SHA512_BLOCK_BYTE_LEN) / size_of::<__m256i>();
+const WORDS_IN_128_BIT_VEC: usize = 16 / size_of::<u64>();
+const WORDS_IN_VEC_AVX: usize = size_of::<__m128i>() / size_of::<u64>();
+const WORDS_IN_VEC_AVX2: usize = size_of::<__m256i>() / size_of::<u64>();
+
+type State = [u64; SHA512_HASH_WORDS_NUM];
+type MsgSchedule = [u64; SHA512_BLOCK_WORDS_NUM];
+type RoundStates = [u64; SHA512_ROUNDS_NUM];
+
+#[inline(always)]
+unsafe fn load_data_avx(
+    x: &mut [__m128i; MS_VEC_NUM_AVX],
+    ms: &mut MsgSchedule,
+    data: *const __m128i,
+) {
+    #[allow(non_snake_case)]
+    let MASK = _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
+
+    macro_rules! unrolled_iterations {
+        ($($i:literal),*) => {$(
+            x[$i] = _mm_loadu_si128(data.add($i) as *const _);
+            x[$i] = _mm_shuffle_epi8(x[$i], MASK);
+
+            let y = _mm_add_epi64(
+                x[$i],
+                _mm_loadu_si128(&K64[WORDS_IN_VEC_AVX * $i] as *const u64 as *const _),
+            );
+
+            _mm_store_si128(&mut ms[WORDS_IN_VEC_AVX * $i] as *mut u64 as *mut _, y);
+        )*};
+    }
+
+    unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+#[inline(always)]
+unsafe fn load_data_avx2(
+    x: &mut [__m256i; MS_VEC_NUM_AVX2],
+    ms: &mut MsgSchedule,
+    t2: &mut RoundStates,
+    data: *const __m128i,
+) {
+    #[allow(non_snake_case)]
+    let MASK = _mm256_set_epi64x(
+        0x0809_0A0B_0C0D_0E0F_i64,
+        0x0001_0203_0405_0607_i64,
+        0x0809_0A0B_0C0D_0E0F_i64,
+        0x0001_0203_0405_0607_i64,
+    );
+
+    macro_rules! unrolled_iterations {
+        ($($i:literal),*) => {$(
+            x[$i] = _mm256_insertf128_si256::<1>(x[$i], _mm_loadu_si128(data.add($i) as *const _));
+            x[$i] = _mm256_insertf128_si256::<0>(x[$i], _mm_loadu_si128(data.add($i + 1) as *const _));
+
+            x[$i] = _mm256_shuffle_epi8(x[$i], MASK);
+            let y = _mm256_add_epi64(
+                x[$i],
+                _mm256_loadu_si256(&K64X4[4 * $i] as *const u64 as *const _),
+            );
+
+            _mm_store_si128(
+                &mut ms[2 * $i] as *mut u64 as *mut _,
+                _mm256_extracti128_si256::<0>(y),
+            );
+            _mm_store_si128(
+                &mut t2[2 * $i] as *mut u64 as *mut _,
+                _mm256_extracti128_si256::<1>(y),
+            );
+        )*};
+    }
+
+    unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7);
+}
+
+#[inline(always)]
+unsafe fn rounds_0_63_avx(
+    current_state: &mut State,
+    x: &mut [__m128i; MS_VEC_NUM_AVX],
+    ms: &mut MsgSchedule,
+) {
+    let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM;
+
+    for _ in 0..4 {
+        macro_rules! 
unrolled_iterations { + ($($j:literal),*) => {$( + let y = sha512_update_x_avx(x, &K64[k64_idx] as *const u64 as *const _); + + sha_round(current_state, ms[WORDS_IN_VEC_AVX * $j]); + sha_round(current_state, ms[WORDS_IN_VEC_AVX * $j + 1]); + + _mm_store_si128(&mut ms[WORDS_IN_VEC_AVX * $j] as *const u64 as *mut _, y); + k64_idx += WORDS_IN_VEC_AVX; + )*}; + } + + unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7); + } +} + +#[inline(always)] +unsafe fn rounds_0_63_avx2( + current_state: &mut State, + x: &mut [__m256i; MS_VEC_NUM_AVX2], + ms: &mut MsgSchedule, + t2: &mut RoundStates, +) { + let mut k64x2_idx: usize = 2 * SHA512_BLOCK_WORDS_NUM; + + for i in 1..5 { + macro_rules! unrolled_iterations { + ($($j:literal),*) => {$( + let y = sha512_update_x_avx2(x, &K64X4[k64x2_idx] as *const u64 as *const _); + + sha_round(current_state, ms[WORDS_IN_128_BIT_VEC * $j]); + sha_round(current_state, ms[WORDS_IN_128_BIT_VEC * $j + 1]); + + _mm_store_si128( + &mut ms[WORDS_IN_128_BIT_VEC * $j] as *mut u64 as *mut _, + _mm256_extracti128_si256::<0>(y), + ); + _mm_store_si128( + &mut t2[(16 * i) + WORDS_IN_128_BIT_VEC * $j] as *mut u64 as *mut _, + _mm256_extracti128_si256::<1>(y), + ); + + k64x2_idx += WORDS_IN_VEC_AVX2; + )*}; + } + + unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7); + } +} + +#[inline(always)] +unsafe fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) { + macro_rules! unrolled_iterations { + ($($i:literal),*) => {$( + sha_round(current_state, ms[$i & 0xf]); + )*}; + } + + unrolled_iterations!(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79); +} + +#[inline(always)] +unsafe fn process_second_block(current_state: &mut State, t2: RoundStates) { + macro_rules! unrolled_iterations { + ($($i:literal),*) => {$( + sha_round(current_state, t2[$i]); + )*}; + } + + unrolled_iterations!( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, + 71, 72, 73, 74, 75, 76, 77, 78, 79 + ); +} + +#[inline(always)] +unsafe fn sha_round(s: &mut State, x: u64) { + macro_rules! big_sigma0 { + ($a:expr) => { + $a.rotate_right(28) ^ $a.rotate_right(34) ^ $a.rotate_right(39) + }; + } + macro_rules! big_sigma1 { + ($a:expr) => { + $a.rotate_right(14) ^ $a.rotate_right(18) ^ $a.rotate_right(41) + }; + } + macro_rules! bool3ary_202 { + ($a:expr, $b:expr, $c:expr) => { + $c ^ ($a & ($b ^ $c)) + }; + } // Choose, MD5F, SHA1C + macro_rules! bool3ary_232 { + ($a:expr, $b:expr, $c:expr) => { + ($a & $b) ^ ($a & $c) ^ ($b & $c) + }; + } // Majority, SHA1M + + macro_rules! rotate_state { + ($s:ident) => {{ + let tmp = $s[7]; + $s[7] = $s[6]; + $s[6] = $s[5]; + $s[5] = $s[4]; + $s[4] = $s[3]; + $s[3] = $s[2]; + $s[2] = $s[1]; + $s[1] = $s[0]; + $s[0] = tmp; + }}; + } + + let t = x + .wrapping_add(s[7]) + .wrapping_add(big_sigma1!(s[4])) + .wrapping_add(bool3ary_202!(s[4], s[5], s[6])); + + s[7] = t + .wrapping_add(big_sigma0!(s[0])) + .wrapping_add(bool3ary_232!(s[0], s[1], s[2])); + s[3] = s[3].wrapping_add(t); + + rotate_state!(s); +} + +#[inline(always)] +unsafe fn accumulate_state(dst: &mut State, src: &State) { + for i in 0..SHA512_HASH_WORDS_NUM { + dst[i] = dst[i].wrapping_add(src[i]); + } +} + +macro_rules! 
fn_sha512_update_x {
+    ($name:ident, $ty:ident, {
+        LOAD = $LOAD:ident,
+        ADD64 = $ADD64:ident,
+        ALIGNR8 = $ALIGNR8:ident,
+        SRL64 = $SRL64:ident,
+        SLL64 = $SLL64:ident,
+        XOR = $XOR:ident,
+    }) => {
+        unsafe fn $name(x: &mut [$ty; 8], k64_p: *const $ty) -> $ty {
+            // q[2:1]
+            let mut t0 = $ALIGNR8::<8>(x[1], x[0]);
+            // q[10:9]
+            let mut t3 = $ALIGNR8::<8>(x[5], x[4]);
+            // q[2:1] >> s0[0]
+            let mut t2 = $SRL64::<1>(t0);
+            // q[1:0] + q[10:9]
+            x[0] = $ADD64(x[0], t3);
+            // q[2:1] >> s0[2]
+            t3 = $SRL64::<7>(t0);
+            // q[2:1] << (64 - s0[1])
+            let mut t1 = $SLL64::<{ 64 - 8 }>(t0);
+            // (q[2:1] >> s0[2]) ^
+            // (q[2:1] >> s0[0])
+            t0 = $XOR(t3, t2);
+            // q[2:1] >> s0[1]
+            t2 = $SRL64::<{ 8 - 1 }>(t2);
+            // (q[2:1] >> s0[2]) ^
+            // (q[2:1] >> s0[0]) ^
+            // q[2:1] << (64 - s0[1])
+            t0 = $XOR(t0, t1);
+            // q[2:1] << (64 - s0[0])
+            t1 = $SLL64::<{ 8 - 1 }>(t1);
+            // sigma1(q[2:1])
+            t0 = $XOR(t0, t2);
+            t0 = $XOR(t0, t1);
+            // q[15:14] >> s1[2]
+            t3 = $SRL64::<6>(x[7]);
+            // q[15:14] >> (64 - s1[1])
+            t2 = $SLL64::<{ 64 - 61 }>(x[7]);
+            // q[1:0] + sigma0(q[2:1])
+            x[0] = $ADD64(x[0], t0);
+            // q[15:14] >> s1[0]
+            t1 = $SRL64::<19>(x[7]);
+            // q[15:14] >> s1[2] ^
+            // q[15:14] >> (64 - s1[1])
+            t3 = $XOR(t3, t2);
+            // q[15:14] >> (64 - s1[0])
+            t2 = $SLL64::<{ 61 - 19 }>(t2);
+            // q[15:14] >> s1[2] ^
+            // q[15:14] >> (64 - s1[1] ^
+            // q[15:14] >> s1[0]
+            t3 = $XOR(t3, t1);
+            // q[15:14] >> s1[1]
+            t1 = $SRL64::<{ 61 - 19 }>(t1);
+            // sigma1(q[15:14])
+            t3 = $XOR(t3, t2);
+            t3 = $XOR(t3, t1);
+
+            // q[1:0] + q[10:9] + sigma1(q[15:14]) + sigma0(q[2:1])
+            x[0] = $ADD64(x[0], t3);
+
+            // rotate
+            let temp = x[0];
+            x[0] = x[1];
+            x[1] = x[2];
+            x[2] = x[3];
+            x[3] = x[4];
+            x[4] = x[5];
+            x[5] = x[6];
+            x[6] = x[7];
+            x[7] = temp;
+
+            $ADD64(x[7], $LOAD(k64_p))
+        }
+    };
+}
+
+fn_sha512_update_x!(sha512_update_x_avx, __m128i, {
+    LOAD = _mm_loadu_si128,
+    ADD64 = _mm_add_epi64,
+    ALIGNR8 = _mm_alignr_epi8,
+    SRL64 = _mm_srli_epi64,
+    SLL64 = _mm_slli_epi64,
+    XOR = _mm_xor_si128,
+});
+
+fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
+    LOAD = _mm256_loadu_si256,
+    ADD64 = _mm256_add_epi64,
+    ALIGNR8 = _mm256_alignr_epi8,
+    SRL64 = _mm256_srli_epi64,
+    SLL64 = _mm256_slli_epi64,
+    XOR = _mm256_xor_si256,
+});
+
+#[inline(always)]
+unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
+    let mut ms = Default::default();
+    let mut x = [_mm_setzero_si128(); MS_VEC_NUM_AVX];
+
+    let mut current_state = *state;
+    load_data_avx(&mut x, &mut ms, block.as_ptr() as *const _);
+    rounds_0_63_avx(&mut current_state, &mut x, &mut ms);
+    rounds_64_79(&mut current_state, &ms);
+    accumulate_state(state, &current_state);
+}
+
+#[inline(always)]
+unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    let mut start_block = 0;
+
+    if blocks.len() & 0b1 != 0 {
+        sha512_compress_x86_64_avx(state, &blocks[0]);
+        start_block += 1;
+    }
+
+    let mut ms: MsgSchedule = Default::default();
+    let mut t2: RoundStates = [0u64; SHA512_ROUNDS_NUM];
+    let mut x = [_mm256_setzero_si256(); MS_VEC_NUM_AVX2];
+
+    for i in (start_block..blocks.len()).step_by(2) {
+        load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _);
+
+        // First block
+        let mut current_state = *state;
+        rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2);
+        rounds_64_79(&mut current_state, &ms);
+        accumulate_state(state, &current_state);
+
+        // Second block
+        current_state = *state;
+        process_second_block(&mut current_state, t2);
+        accumulate_state(state, &current_state);
+    }
+}
+
+#[allow(clippy::cast_ptr_alignment)] +#[target_feature(enable = "avx,avx2,sse2,sse3")] +unsafe fn digest_blocks(state: &mut State, blocks: &[[u8; 128]]) { + sha512_compress_x86_64_avx2(state, blocks); +} + +cpufeatures::new!(avx2_cpuid, "avx", "avx2", "sse2", "sse3"); + +pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { + // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 + // after stabilization + if avx2_cpuid::get() { + unsafe { + digest_blocks(state, blocks); + } + } else { + super::soft::compress(state, blocks); + } +} From 6fa4b0dec5e759083f501a09a964dfaefd5e3d22 Mon Sep 17 00:00:00 2001 From: Ivan Kalinin Date: Mon, 6 Sep 2021 12:30:41 +0300 Subject: [PATCH 2/6] sha2: Remove loop unrolling in rounds --- sha2/src/sha512/x86.rs | 73 +++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 47 deletions(-) diff --git a/sha2/src/sha512/x86.rs b/sha2/src/sha512/x86.rs index 69bdaa55d..7d9b32166 100644 --- a/sha2/src/sha512/x86.rs +++ b/sha2/src/sha512/x86.rs @@ -102,19 +102,15 @@ unsafe fn rounds_0_63_avx( let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM; for _ in 0..4 { - macro_rules! unrolled_iterations { - ($($j:literal),*) => {$( - let y = sha512_update_x_avx(x, &K64[k64_idx] as *const u64 as *const _); + for j in 0..8 { + let y = sha512_update_x_avx(x, &K64[k64_idx] as *const u64 as *const _); - sha_round(current_state, ms[WORDS_IN_VEC_AVX * $j]); - sha_round(current_state, ms[WORDS_IN_VEC_AVX * $j + 1]); + sha_round(current_state, ms[WORDS_IN_VEC_AVX * j]); + sha_round(current_state, ms[WORDS_IN_VEC_AVX * j + 1]); - _mm_store_si128(&mut ms[WORDS_IN_VEC_AVX * $j] as *const u64 as *mut _, y); - k64_idx += WORDS_IN_VEC_AVX; - )*}; + _mm_store_si128(&mut ms[WORDS_IN_VEC_AVX * j] as *const u64 as *mut _, y); + k64_idx += WORDS_IN_VEC_AVX; } - - unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7); } } @@ -128,55 +124,38 @@ unsafe fn rounds_0_63_avx2( let mut k64x2_idx: usize = 2 * SHA512_BLOCK_WORDS_NUM; for i in 1..5 { - macro_rules! unrolled_iterations { - ($($j:literal),*) => {$( - let y = sha512_update_x_avx2(x, &K64X4[k64x2_idx] as *const u64 as *const _); - - sha_round(current_state, ms[WORDS_IN_128_BIT_VEC * $j]); - sha_round(current_state, ms[WORDS_IN_128_BIT_VEC * $j + 1]); - - _mm_store_si128( - &mut ms[WORDS_IN_128_BIT_VEC * $j] as *mut u64 as *mut _, - _mm256_extracti128_si256::<0>(y), - ); - _mm_store_si128( - &mut t2[(16 * i) + WORDS_IN_128_BIT_VEC * $j] as *mut u64 as *mut _, - _mm256_extracti128_si256::<1>(y), - ); - - k64x2_idx += WORDS_IN_VEC_AVX2; - )*}; - } + for j in 0..8 { + let y = sha512_update_x_avx2(x, &K64X4[k64x2_idx] as *const u64 as *const _); + + sha_round(current_state, ms[WORDS_IN_128_BIT_VEC * j]); + sha_round(current_state, ms[WORDS_IN_128_BIT_VEC * j + 1]); + + _mm_store_si128( + &mut ms[WORDS_IN_128_BIT_VEC * j] as *mut u64 as *mut _, + _mm256_extracti128_si256::<0>(y), + ); + _mm_store_si128( + &mut t2[(16 * i) + WORDS_IN_128_BIT_VEC * j] as *mut u64 as *mut _, + _mm256_extracti128_si256::<1>(y), + ); - unrolled_iterations!(0, 1, 2, 3, 4, 5, 6, 7); + k64x2_idx += WORDS_IN_VEC_AVX2; + } } } #[inline(always)] unsafe fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) { - macro_rules! 
unrolled_iterations {
-        ($($i:literal),*) => {$(
-            sha_round(current_state, ms[$i & 0xf]);
-        )*};
+    for i in 64..80 {
+        sha_round(current_state, ms[i & 0xf]);
     }
-
-    unrolled_iterations!(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79);
 }
 
 #[inline(always)]
 unsafe fn process_second_block(current_state: &mut State, t2: RoundStates) {
-    macro_rules! unrolled_iterations {
-        ($($i:literal),*) => {$(
-            sha_round(current_state, t2[$i]);
-        )*};
+    for t2 in t2 {
+        sha_round(current_state, t2);
     }
-
-    unrolled_iterations!(
-        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
-        25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
-        48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
-        71, 72, 73, 74, 75, 76, 77, 78, 79
-    );
 }
 
 #[inline(always)]

From d5a6e14e8020ee94d67295fe45fa6b9eac5673db Mon Sep 17 00:00:00 2001
From: Ivan Kalinin 
Date: Mon, 6 Sep 2021 13:05:18 +0300
Subject: [PATCH 3/6] sha2: Refactor

---
 sha2/src/sha512/x86.rs | 179 ++++++++++++++++++-----------------------
 1 file changed, 80 insertions(+), 99 deletions(-)

diff --git a/sha2/src/sha512/x86.rs b/sha2/src/sha512/x86.rs
index 7d9b32166..a00a80877 100644
--- a/sha2/src/sha512/x86.rs
+++ b/sha2/src/sha512/x86.rs
@@ -11,28 +11,64 @@ use core::arch::x86_64::*;
 
 use crate::consts::{K64, K64X4};
 
-const SHA512_BLOCK_BYTE_LEN: usize = 128;
-const SHA512_ROUNDS_NUM: usize = 80;
-const SHA512_HASH_BYTE_LEN: usize = 64;
-const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>();
-const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>();
+cpufeatures::new!(avx2_cpuid, "avx", "avx2", "sse2", "sse3");
 
-const MS_VEC_NUM_AVX: usize = SHA512_BLOCK_BYTE_LEN / size_of::<__m128i>();
-const MS_VEC_NUM_AVX2: usize = (2 * SHA512_BLOCK_BYTE_LEN) / size_of::<__m256i>();
-const WORDS_IN_128_BIT_VEC: usize = 16 / size_of::<u64>();
-const WORDS_IN_VEC_AVX: usize = size_of::<__m128i>() / size_of::<u64>();
-const WORDS_IN_VEC_AVX2: usize = size_of::<__m256i>() / size_of::<u64>();
+pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
+    // after stabilization
+    if avx2_cpuid::get() {
+        unsafe {
+            sha512_compress_x86_64_avx2(state, blocks);
+        }
+    } else {
+        super::soft::compress(state, blocks);
+    }
+}
 
-type State = [u64; SHA512_HASH_WORDS_NUM];
-type MsgSchedule = [u64; SHA512_BLOCK_WORDS_NUM];
-type RoundStates = [u64; SHA512_ROUNDS_NUM];
+#[target_feature(enable = "avx,avx2,sse2,sse3")]
+unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+    let mut start_block = 0;
+
+    if blocks.len() & 0b1 != 0 {
+        sha512_compress_x86_64_avx(state, &blocks[0]);
+        start_block += 1;
+    }
+
+    let mut ms: MsgSchedule = Default::default();
+    let mut t2: RoundStates = [0u64; SHA512_ROUNDS_NUM];
+    let mut x = [_mm256_setzero_si256(); 8];
+
+    for i in (start_block..blocks.len()).step_by(2) {
+        load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _);
+
+        // First block
+        let mut current_state = *state;
+        rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2);
+        rounds_64_79(&mut current_state, &ms);
+        accumulate_state(state, &current_state);
+
+        // Second block
+        current_state = *state;
+        process_second_block(&mut current_state, &t2);
+        accumulate_state(state, &current_state);
+    }
+}
 
 #[inline(always)]
-unsafe fn load_data_avx(
-    x: &mut [__m128i; MS_VEC_NUM_AVX],
-    ms: &mut MsgSchedule,
-    data: *const __m128i,
-) {
+unsafe 
fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
+    let mut ms = Default::default();
+    let mut x = [_mm_setzero_si128(); 8];
+
+    // Reduced to single iteration
+    let mut current_state = *state;
+    load_data_avx(&mut x, &mut ms, block.as_ptr() as *const _);
+    rounds_0_63_avx(&mut current_state, &mut x, &mut ms);
+    rounds_64_79(&mut current_state, &ms);
+    accumulate_state(state, &current_state);
+}
+
+#[inline(always)]
+unsafe fn load_data_avx(x: &mut [__m128i; 8], ms: &mut MsgSchedule, data: *const __m128i) {
     #[allow(non_snake_case)]
     let MASK = _mm_setr_epi32(0x04050607, 0x00010203, 0x0c0d0e0f, 0x08090a0b);
 
@@ -43,10 +79,10 @@ unsafe fn load_data_avx(
 
             let y = _mm_add_epi64(
                 x[$i],
-                _mm_loadu_si128(&K64[WORDS_IN_VEC_AVX * $i] as *const u64 as *const _),
+                _mm_loadu_si128(&K64[2 * $i] as *const u64 as *const _),
             );
 
-            _mm_store_si128(&mut ms[WORDS_IN_VEC_AVX * $i] as *mut u64 as *mut _, y);
+            _mm_store_si128(&mut ms[2 * $i] as *mut u64 as *mut _, y);
         )*};
     }
 
@@ -55,7 +91,7 @@ unsafe fn load_data_avx(
 
 #[inline(always)]
 unsafe fn load_data_avx2(
-    x: &mut [__m256i; MS_VEC_NUM_AVX2],
+    x: &mut [__m256i; 8],
     ms: &mut MsgSchedule,
     t2: &mut RoundStates,
     data: *const __m128i,
@@ -94,22 +130,18 @@ unsafe fn load_data_avx2(
 }
 
 #[inline(always)]
-unsafe fn rounds_0_63_avx(
-    current_state: &mut State,
-    x: &mut [__m128i; MS_VEC_NUM_AVX],
-    ms: &mut MsgSchedule,
-) {
+unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: &mut MsgSchedule) {
     let mut k64_idx: usize = SHA512_BLOCK_WORDS_NUM;
 
     for _ in 0..4 {
         for j in 0..8 {
             let y = sha512_update_x_avx(x, &K64[k64_idx] as *const u64 as *const _);
 
-            sha_round(current_state, ms[WORDS_IN_VEC_AVX * j]);
-            sha_round(current_state, ms[WORDS_IN_VEC_AVX * j + 1]);
+            sha_round(current_state, ms[2 * j]);
+            sha_round(current_state, ms[2 * j + 1]);
 
-            _mm_store_si128(&mut ms[WORDS_IN_VEC_AVX * j] as *const u64 as *mut _, y);
-            k64_idx += WORDS_IN_VEC_AVX;
+            _mm_store_si128(&mut ms[2 * j] as *const u64 as *mut _, y);
+            k64_idx += 2;
         }
     }
 }
@@ -117,29 +149,29 @@ unsafe fn rounds_0_63_avx(
 #[inline(always)]
 unsafe fn rounds_0_63_avx2(
     current_state: &mut State,
-    x: &mut [__m256i; MS_VEC_NUM_AVX2],
+    x: &mut [__m256i; 8],
     ms: &mut MsgSchedule,
     t2: &mut RoundStates,
 ) {
-    let mut k64x2_idx: usize = 2 * SHA512_BLOCK_WORDS_NUM;
+    let mut k64x4_idx: usize = 2 * SHA512_BLOCK_WORDS_NUM;
 
     for i in 1..5 {
         for j in 0..8 {
-            let y = sha512_update_x_avx2(x, &K64X4[k64x2_idx] as *const u64 as *const _);
+            let y = sha512_update_x_avx2(x, &K64X4[k64x4_idx] as *const u64 as *const _);
 
-            sha_round(current_state, ms[WORDS_IN_128_BIT_VEC * j]);
-            sha_round(current_state, ms[WORDS_IN_128_BIT_VEC * j + 1]);
+            sha_round(current_state, ms[2 * j]);
+            sha_round(current_state, ms[2 * j + 1]);
 
             _mm_store_si128(
-                &mut ms[WORDS_IN_128_BIT_VEC * j] as *mut u64 as *mut _,
+                &mut ms[2 * j] as *mut u64 as *mut _,
                 _mm256_extracti128_si256::<0>(y),
             );
             _mm_store_si128(
-                &mut t2[(16 * i) + WORDS_IN_128_BIT_VEC * j] as *mut u64 as *mut _,
+                &mut t2[(16 * i) + 2 * j] as *mut u64 as *mut _,
                 _mm256_extracti128_si256::<1>(y),
             );
 
-            k64x2_idx += WORDS_IN_VEC_AVX2;
+            k64x4_idx += 4;
         }
     }
 }
@@ -152,9 +184,9 @@ unsafe fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) {
 }
 
 #[inline(always)]
-unsafe fn process_second_block(current_state: &mut State, t2: RoundStates) {
+unsafe fn process_second_block(current_state: &mut State, t2: &RoundStates) {
     for t2 in t2 {
-        sha_round(current_state, t2);
+        sha_round(current_state, *t2);
     }
 }
 
@@ -311,63 +343,12 @@ 
fn_sha512_update_x!(sha512_update_x_avx2, __m256i, {
     XOR = _mm256_xor_si256,
 });
 
-#[inline(always)]
-unsafe fn sha512_compress_x86_64_avx(state: &mut [u64; 8], block: &[u8; 128]) {
-    let mut ms = Default::default();
-    let mut x = [_mm_setzero_si128(); MS_VEC_NUM_AVX];
-
-    let mut current_state = *state;
-    load_data_avx(&mut x, &mut ms, block.as_ptr() as *const _);
-    rounds_0_63_avx(&mut current_state, &mut x, &mut ms);
-    rounds_64_79(&mut current_state, &ms);
-    accumulate_state(state, &current_state);
-}
-
-#[inline(always)]
-unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
-    let mut start_block = 0;
-
-    if blocks.len() & 0b1 != 0 {
-        sha512_compress_x86_64_avx(state, &blocks[0]);
-        start_block += 1;
-    }
-
-    let mut ms: MsgSchedule = Default::default();
-    let mut t2: RoundStates = [0u64; SHA512_ROUNDS_NUM];
-    let mut x = [_mm256_setzero_si256(); MS_VEC_NUM_AVX2];
-
-    for i in (start_block..blocks.len()).step_by(2) {
-        load_data_avx2(&mut x, &mut ms, &mut t2, blocks.as_ptr().add(i) as *const _);
-
-        // First block
-        let mut current_state = *state;
-        rounds_0_63_avx2(&mut current_state, &mut x, &mut ms, &mut t2);
-        rounds_64_79(&mut current_state, &ms);
-        accumulate_state(state, &current_state);
-
-        // Second block
-        current_state = *state;
-        process_second_block(&mut current_state, t2);
-        accumulate_state(state, &current_state);
-    }
-}
-
-#[allow(clippy::cast_ptr_alignment)]
-#[target_feature(enable = "avx,avx2,sse2,sse3")]
-unsafe fn digest_blocks(state: &mut State, blocks: &[[u8; 128]]) {
-    sha512_compress_x86_64_avx2(state, blocks);
-}
-
-cpufeatures::new!(avx2_cpuid, "avx", "avx2", "sse2", "sse3");
+type State = [u64; SHA512_HASH_WORDS_NUM];
+type MsgSchedule = [u64; SHA512_BLOCK_WORDS_NUM];
+type RoundStates = [u64; SHA512_ROUNDS_NUM];
 
-pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
-    // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725
-    // after stabilization
-    if avx2_cpuid::get() {
-        unsafe {
-            digest_blocks(state, blocks);
-        }
-    } else {
-        super::soft::compress(state, blocks);
-    }
-}
+const SHA512_BLOCK_BYTE_LEN: usize = 128;
+const SHA512_ROUNDS_NUM: usize = 80;
+const SHA512_HASH_BYTE_LEN: usize = 64;
+const SHA512_HASH_WORDS_NUM: usize = SHA512_HASH_BYTE_LEN / size_of::<u64>();
+const SHA512_BLOCK_WORDS_NUM: usize = SHA512_BLOCK_BYTE_LEN / size_of::<u64>();

From 16e0cba96d138977269c3221479f8cd9ab83c8a8 Mon Sep 17 00:00:00 2001
From: Ivan Kalinin 
Date: Mon, 6 Sep 2021 13:50:48 +0300
Subject: [PATCH 4/6] sha2: Fix build with asm feature

---
 sha2/src/sha512.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sha2/src/sha512.rs b/sha2/src/sha512.rs
index 078d4b786..22f141e43 100644
--- a/sha2/src/sha512.rs
+++ b/sha2/src/sha512.rs
@@ -236,7 +236,7 @@ cfg_if::cfg_if! {
         mod soft;
         #[cfg(feature = "asm")]
         mod soft {
-            fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
+            pub(crate) fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) {
                 sha2_asm::compress512(state, blocks);
             }
         }

From b2f6d86af3521153cf38c327509d85d390f4dbea Mon Sep 17 00:00:00 2001
From: Ivan Kalinin 
Date: Mon, 6 Sep 2021 14:52:07 +0300
Subject: [PATCH 5/6] sha2: Fix build on 1.41

---
 sha2/src/sha512/x86.rs | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/sha2/src/sha512/x86.rs b/sha2/src/sha512/x86.rs
index a00a80877..fedf3e664 100644
--- a/sha2/src/sha512/x86.rs
+++ b/sha2/src/sha512/x86.rs
@@ -106,8 +106,8 @@ unsafe fn load_data_avx2(
 
     macro_rules! 
unrolled_iterations { ($($i:literal),*) => {$( - x[$i] = _mm256_insertf128_si256::<1>(x[$i], _mm_loadu_si128(data.add($i) as *const _)); - x[$i] = _mm256_insertf128_si256::<0>(x[$i], _mm_loadu_si128(data.add($i + 1) as *const _)); + x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i) as *const _), 1); + x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i + 1) as *const _), 0); x[$i] = _mm256_shuffle_epi8(x[$i], MASK); let y = _mm256_add_epi64( @@ -117,11 +117,11 @@ unsafe fn load_data_avx2( _mm_store_si128( &mut ms[2 * $i] as *mut u64 as *mut _, - _mm256_extracti128_si256::<0>(y), + _mm256_extracti128_si256(y, 0), ); _mm_store_si128( &mut t2[2 * $i] as *mut u64 as *mut _, - _mm256_extracti128_si256::<1>(y), + _mm256_extracti128_si256(y, 1), ); )*}; } @@ -164,11 +164,11 @@ unsafe fn rounds_0_63_avx2( _mm_store_si128( &mut ms[2 * j] as *mut u64 as *mut _, - _mm256_extracti128_si256::<0>(y), + _mm256_extracti128_si256(y, 0), ); _mm_store_si128( &mut t2[(16 * i) + 2 * j] as *mut u64 as *mut _, - _mm256_extracti128_si256::<1>(y), + _mm256_extracti128_si256(y, 1), ); k64x4_idx += 4; @@ -185,7 +185,7 @@ unsafe fn rounds_64_79(current_state: &mut State, ms: &MsgSchedule) { #[inline(always)] unsafe fn process_second_block(current_state: &mut State, t2: &RoundStates) { - for t2 in t2 { + for t2 in t2.iter() { sha_round(current_state, *t2); } } @@ -258,50 +258,50 @@ macro_rules! fn_sha512_update_x { }) => { unsafe fn $name(x: &mut [$ty; 8], k64_p: *const $ty) -> $ty { // q[2:1] - let mut t0 = $ALIGNR8::<8>(x[1], x[0]); + let mut t0 = $ALIGNR8(x[1], x[0], 8); // q[10:9] - let mut t3 = $ALIGNR8::<8>(x[5], x[4]); + let mut t3 = $ALIGNR8(x[5], x[4], 8); // q[2:1] >> s0[0] - let mut t2 = $SRL64::<1>(t0); + let mut t2 = $SRL64(t0, 1); // q[1:0] + q[10:9] x[0] = $ADD64(x[0], t3); // q[2:1] >> s0[2] - t3 = $SRL64::<7>(t0); + t3 = $SRL64(t0, 7); // q[2:1] << (64 - s0[1]) - let mut t1 = $SLL64::<{ 64 - 8 }>(t0); + let mut t1 = $SLL64(t0, 64 - 8); // (q[2:1] >> s0[2]) ^ // (q[2:1] >> s0[0]) t0 = $XOR(t3, t2); // q[2:1] >> s0[1] - t2 = $SRL64::<{ 8 - 1 }>(t2); + t2 = $SRL64(t2, 8 - 1); // (q[2:1] >> s0[2]) ^ // (q[2:1] >> s0[0]) ^ // q[2:1] << (64 - s0[1]) t0 = $XOR(t0, t1); // q[2:1] << (64 - s0[0]) - t1 = $SLL64::<{ 8 - 1 }>(t1); + t1 = $SLL64(t1, 8 - 1); // sigma1(q[2:1]) t0 = $XOR(t0, t2); t0 = $XOR(t0, t1); // q[15:14] >> s1[2] - t3 = $SRL64::<6>(x[7]); + t3 = $SRL64(x[7], 6); // q[15:14] >> (64 - s1[1]) - t2 = $SLL64::<{ 64 - 61 }>(x[7]); + t2 = $SLL64(x[7], 64 - 61); // q[1:0] + sigma0(q[2:1]) x[0] = $ADD64(x[0], t0); // q[15:14] >> s1[0] - t1 = $SRL64::<19>(x[7]); + t1 = $SRL64(x[7], 19); // q[15:14] >> s1[2] ^ // q[15:14] >> (64 - s1[1]) t3 = $XOR(t3, t2); // q[15:14] >> (64 - s1[0]) - t2 = $SLL64::<{ 61 - 19 }>(t2); + t2 = $SLL64(t2, 61 - 19); // q[15:14] >> s1[2] ^ // q[15:14] >> (64 - s1[1] ^ // q[15:14] >> s1[0] t3 = $XOR(t3, t1); // q[15:14] >> s1[1] - t1 = $SRL64::<{ 61 - 19 }>(t1); + t1 = $SRL64(t1, 61 - 19); // sigma1(q[15:14]) t3 = $XOR(t3, t2); t3 = $XOR(t3, t1); From 03a3ebda81983a03c103edfc73cac2171eb0719b Mon Sep 17 00:00:00 2001 From: Ivan Kalinin Date: Wed, 8 Sep 2021 15:12:23 +0300 Subject: [PATCH 6/6] sha2: Reduce memory pressure --- sha2/src/consts.rs | 50 ------------------------------------------ sha2/src/sha512/x86.rs | 30 ++++++++++++------------- 2 files changed, 14 insertions(+), 66 deletions(-) diff --git a/sha2/src/consts.rs b/sha2/src/consts.rs index 4aaff0f41..f126dc616 100644 --- a/sha2/src/consts.rs +++ b/sha2/src/consts.rs @@ -163,56 +163,6 
@@ pub const K64X2: [[u64; 2]; 40] = [ [K64[79], K64[78]], ]; -macro_rules! dup_array { - ([$([$a:expr, $b:expr]),*,]) => {[ - $($b, $a, $b, $a),*, - ]} -} - -/// Constants necessary for SHA-512 family of digests. -pub const K64X4: [u64; 160] = dup_array!([ - [K64[1], K64[0]], - [K64[3], K64[2]], - [K64[5], K64[4]], - [K64[7], K64[6]], - [K64[9], K64[8]], - [K64[11], K64[10]], - [K64[13], K64[12]], - [K64[15], K64[14]], - [K64[17], K64[16]], - [K64[19], K64[18]], - [K64[21], K64[20]], - [K64[23], K64[22]], - [K64[25], K64[24]], - [K64[27], K64[26]], - [K64[29], K64[28]], - [K64[31], K64[30]], - [K64[33], K64[32]], - [K64[35], K64[34]], - [K64[37], K64[36]], - [K64[39], K64[38]], - [K64[41], K64[40]], - [K64[43], K64[42]], - [K64[45], K64[44]], - [K64[47], K64[46]], - [K64[49], K64[48]], - [K64[51], K64[50]], - [K64[53], K64[52]], - [K64[55], K64[54]], - [K64[57], K64[56]], - [K64[59], K64[58]], - [K64[61], K64[60]], - [K64[63], K64[62]], - [K64[65], K64[64]], - [K64[67], K64[66]], - [K64[69], K64[68]], - [K64[71], K64[70]], - [K64[73], K64[72]], - [K64[75], K64[74]], - [K64[77], K64[76]], - [K64[79], K64[78]], -]); - pub static H224: [u32; STATE_LEN] = [ 0xc1059ed8, 0x367cd507, 0x3070dd17, 0xf70e5939, 0xffc00b31, 0x68581511, 0x64f98fa7, 0xbefa4fa4, ]; diff --git a/sha2/src/sha512/x86.rs b/sha2/src/sha512/x86.rs index fedf3e664..09b3ceeaf 100644 --- a/sha2/src/sha512/x86.rs +++ b/sha2/src/sha512/x86.rs @@ -9,9 +9,9 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -use crate::consts::{K64, K64X4}; +use crate::consts::K64; -cpufeatures::new!(avx2_cpuid, "avx", "avx2", "sse2", "sse3"); +cpufeatures::new!(avx2_cpuid, "avx2"); pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { // TODO: Replace with https://github.com/rust-lang/rfcs/pull/2725 @@ -25,7 +25,7 @@ pub fn compress(state: &mut [u64; 8], blocks: &[[u8; 128]]) { } } -#[target_feature(enable = "avx,avx2,sse2,sse3")] +#[target_feature(enable = "avx2")] unsafe fn sha512_compress_x86_64_avx2(state: &mut [u64; 8], blocks: &[[u8; 128]]) { let mut start_block = 0; @@ -110,10 +110,9 @@ unsafe fn load_data_avx2( x[$i] = _mm256_insertf128_si256(x[$i], _mm_loadu_si128(data.add($i + 1) as *const _), 0); x[$i] = _mm256_shuffle_epi8(x[$i], MASK); - let y = _mm256_add_epi64( - x[$i], - _mm256_loadu_si256(&K64X4[4 * $i] as *const u64 as *const _), - ); + + let t = _mm_loadu_si128(K64.as_ptr().add($i * 2) as *const u64 as *const _); + let y = _mm256_add_epi64(x[$i], _mm256_set_m128i(t, t)); _mm_store_si128( &mut ms[2 * $i] as *mut u64 as *mut _, @@ -135,7 +134,8 @@ unsafe fn rounds_0_63_avx(current_state: &mut State, x: &mut [__m128i; 8], ms: & for _ in 0..4 { for j in 0..8 { - let y = sha512_update_x_avx(x, &K64[k64_idx] as *const u64 as *const _); + let k64 = _mm_loadu_si128(&K64[k64_idx] as *const u64 as *const _); + let y = sha512_update_x_avx(x, k64); sha_round(current_state, ms[2 * j]); sha_round(current_state, ms[2 * j + 1]); @@ -153,11 +153,12 @@ unsafe fn rounds_0_63_avx2( ms: &mut MsgSchedule, t2: &mut RoundStates, ) { - let mut k64x4_idx: usize = 2 * SHA512_BLOCK_WORDS_NUM; + let mut k64x4_idx: usize = SHA512_BLOCK_WORDS_NUM; for i in 1..5 { for j in 0..8 { - let y = sha512_update_x_avx2(x, &K64X4[k64x4_idx] as *const u64 as *const _); + let t = _mm_loadu_si128(K64.as_ptr().add(k64x4_idx) as *const u64 as *const _); + let y = sha512_update_x_avx2(x, _mm256_set_m128i(t, t)); sha_round(current_state, ms[2 * j]); sha_round(current_state, ms[2 * j + 1]); @@ -171,7 +172,7 @@ unsafe fn rounds_0_63_avx2( 
_mm256_extracti128_si256(y, 1), ); - k64x4_idx += 4; + k64x4_idx += 2; } } } @@ -249,14 +250,13 @@ unsafe fn accumulate_state(dst: &mut State, src: &State) { macro_rules! fn_sha512_update_x { ($name:ident, $ty:ident, { - LOAD = $LOAD:ident, ADD64 = $ADD64:ident, ALIGNR8 = $ALIGNR8:ident, SRL64 = $SRL64:ident, SLL64 = $SLL64:ident, XOR = $XOR:ident, }) => { - unsafe fn $name(x: &mut [$ty; 8], k64_p: *const $ty) -> $ty { + unsafe fn $name(x: &mut [$ty; 8], k64: $ty) -> $ty { // q[2:1] let mut t0 = $ALIGNR8(x[1], x[0], 8); // q[10:9] @@ -320,13 +320,12 @@ macro_rules! fn_sha512_update_x { x[6] = x[7]; x[7] = temp; - $ADD64(x[7], $LOAD(k64_p)) + $ADD64(x[7], k64) } }; } fn_sha512_update_x!(sha512_update_x_avx, __m128i, { - LOAD = _mm_loadu_si128, ADD64 = _mm_add_epi64, ALIGNR8 = _mm_alignr_epi8, SRL64 = _mm_srli_epi64, @@ -335,7 +334,6 @@ fn_sha512_update_x!(sha512_update_x_avx, __m128i, { }); fn_sha512_update_x!(sha512_update_x_avx2, __m256i, { - LOAD = _mm256_loadu_si256, ADD64 = _mm256_add_epi64, ALIGNR8 = _mm256_alignr_epi8, SRL64 = _mm256_srli_epi64,