use crate::{Block, StreamClosure, Unsigned, STATE_WORDS};
use cipher::{
    consts::{U4, U64},
    BlockSizeUser, ParBlocks, ParBlocksSizeUser, StreamBackend,
};
use core::marker::PhantomData;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

/// Number of blocks processed in parallel.
const PAR_BLOCKS: usize = 4;
/// Number of `__m256i` to store parallel blocks.
const N: usize = PAR_BLOCKS / 2;

#[inline]
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn inner<R, F>(state: &mut [u32; STATE_WORDS], f: F)
where
    R: Unsigned,
    F: StreamClosure<BlockSize = U64>,
{
    let state_ptr = state.as_ptr() as *const __m128i;
    // Broadcast the first three 128-bit rows of the state into both lanes of each `__m256i`.
    let v = [
        _mm256_broadcastsi128_si256(_mm_loadu_si128(state_ptr.add(0))),
        _mm256_broadcastsi128_si256(_mm_loadu_si128(state_ptr.add(1))),
        _mm256_broadcastsi128_si256(_mm_loadu_si128(state_ptr.add(2))),
    ];
    let mut c = _mm256_broadcastsi128_si256(_mm_loadu_si128(state_ptr.add(3)));
    // The high 128-bit lane of each vector holds the following block, so advance its counter by one.
    c = _mm256_add_epi32(c, _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 0));
    let mut ctr = [c; N];
    for i in 0..N {
        ctr[i] = c;
        // Each `ctr[i]` covers a distinct pair of consecutive blocks.
        c = _mm256_add_epi32(c, _mm256_set_epi32(0, 0, 0, 2, 0, 0, 0, 2));
    }
    let mut backend = Backend::<R> {
        v,
        ctr,
        _pd: PhantomData,
    };

    f.call(&mut backend);

    state[12] = _mm256_extract_epi32(backend.ctr[0], 0) as u32;
}

struct Backend<R: Unsigned> {
    v: [__m256i; 3],
    ctr: [__m256i; N],
    _pd: PhantomData<R>,
}

impl<R: Unsigned> BlockSizeUser for Backend<R> {
    type BlockSize = U64;
}

impl<R: Unsigned> ParBlocksSizeUser for Backend<R> {
    type ParBlocksSize = U4;
}

impl<R: Unsigned> StreamBackend for Backend<R> {
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block) {
        unsafe {
            let res = rounds::<R>(&self.v, &self.ctr);
            for c in self.ctr.iter_mut() {
                *c = _mm256_add_epi32(*c, _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 1));
            }

            let res0: [__m128i; 8] = core::mem::transmute(res[0]);

            // Only the low 128-bit lanes (the first block) are used for the single-block path.
            let block_ptr = block.as_mut_ptr() as *mut __m128i;
            for i in 0..4 {
                _mm_storeu_si128(block_ptr.add(i), res0[2 * i]);
            }
        }
    }

    #[inline(always)]
    fn gen_par_ks_blocks(&mut self, blocks: &mut ParBlocks<Self>) {
        unsafe {
            let vs = rounds::<R>(&self.v, &self.ctr);

            let pb = PAR_BLOCKS as i32;
            for c in self.ctr.iter_mut() {
                *c = _mm256_add_epi32(*c, _mm256_set_epi32(0, 0, 0, pb, 0, 0, 0, pb));
            }

            let mut block_ptr = blocks.as_mut_ptr() as *mut __m128i;
            for v in vs {
                let t: [__m128i; 8] = core::mem::transmute(v);
                for i in 0..4 {
                    _mm_storeu_si128(block_ptr.add(i), t[2 * i]);
                    _mm_storeu_si128(block_ptr.add(4 + i), t[2 * i + 1]);
                }
                block_ptr = block_ptr.add(8);
            }
        }
    }
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn rounds<R: Unsigned>(v: &[__m256i; 3], c: &[__m256i; N]) -> [[__m256i; 4]; N] {
    let mut vs: [[__m256i; 4]; N] = [[_mm256_setzero_si256(); 4]; N];
    for i in 0..N {
        vs[i] = [v[0], v[1], v[2], c[i]];
    }
    for _ in 0..R::USIZE {
        double_quarter_round(&mut vs);
    }

    for i in 0..N {
        for j in 0..3 {
            vs[i][j] = _mm256_add_epi32(vs[i][j], v[j]);
        }
        vs[i][3] = _mm256_add_epi32(vs[i][3], c[i]);
    }

    vs
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn double_quarter_round(v: &mut [[__m256i; 4]; N]) {
    add_xor_rot(v);
    rows_to_cols(v);
    add_xor_rot(v);
    cols_to_rows(v);
}
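
// For reference, a minimal scalar sketch of the ChaCha quarter-round (a hypothetical helper,
// not part of the upstream backend). `add_xor_rot` below applies exactly this add/xor/rotate
// sequence, with rotation amounts 16, 12, 8, and 7, to four columns at once in each 128-bit
// lane; one `double_quarter_round` is a column pass plus a diagonal pass of it.
#[cfg(test)]
#[allow(dead_code)]
fn scalar_quarter_round(a: &mut u32, b: &mut u32, c: &mut u32, d: &mut u32) {
    *a = a.wrapping_add(*b);
    *d ^= *a;
    *d = d.rotate_left(16);

    *c = c.wrapping_add(*d);
    *b ^= *c;
    *b = b.rotate_left(12);

    *a = a.wrapping_add(*b);
    *d ^= *a;
    *d = d.rotate_left(8);

    *c = c.wrapping_add(*d);
    *b ^= *c;
    *b = b.rotate_left(7);
}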

/// The goal of this function is to transform the state words from:
/// ```text
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
/// [c0, c1, c2, c3]    [ 8,  9, 10, 11]
/// [d0, d1, d2, d3]    [12, 13, 14, 15]
/// ```
///
/// to:
/// ```text
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
/// [b1, b2, b3, b0] == [ 5,  6,  7,  4]
/// [c2, c3, c0, c1]    [10, 11,  8,  9]
/// [d3, d0, d1, d2]    [15, 12, 13, 14]
/// ```
///
/// so that we can apply [`add_xor_rot`] to the resulting columns, and have it compute the
/// "diagonal rounds" (as defined in RFC 7539) in parallel. In practice, this shuffle is
/// non-optimal: the last state word to be altered in `add_xor_rot` is `b`, so the shuffle
/// blocks on the result of `b` being calculated.
///
/// We can optimize this by observing that the four quarter rounds in `add_xor_rot` are
/// data-independent: they only access a single column of the state, and thus the order of
/// the columns does not matter. We therefore instead shuffle the other three state words,
/// to obtain the following equivalent layout:
/// ```text
/// [a3, a0, a1, a2]    [ 3,  0,  1,  2]
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
/// [c1, c2, c3, c0]    [ 9, 10, 11,  8]
/// [d2, d3, d0, d1]    [14, 15, 12, 13]
/// ```
///
/// See <https://github.com/sneves/blake2-avx2/pull/4> for additional details. The earliest
/// known occurrence of this optimization is in floodyberry's SSE4 ChaCha code from 2014:
/// - <https://github.com/floodyberry/chacha-opt/blob/0ab65cb99f5016633b652edebaf3691ceb4ff753/chacha_blocks_ssse3-64.S#L639-L643>
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn rows_to_cols(vs: &mut [[__m256i; 4]; N]) {
    // c >>>= 32; d >>>= 64; a >>>= 96;
    for [a, _, c, d] in vs {
        *c = _mm256_shuffle_epi32(*c, 0b_00_11_10_01); // _MM_SHUFFLE(0, 3, 2, 1)
        *d = _mm256_shuffle_epi32(*d, 0b_01_00_11_10); // _MM_SHUFFLE(1, 0, 3, 2)
        *a = _mm256_shuffle_epi32(*a, 0b_10_01_00_11); // _MM_SHUFFLE(2, 1, 0, 3)
    }
}

/// The goal of this function is to transform the state words from:
/// ```text
/// [a3, a0, a1, a2]    [ 3,  0,  1,  2]
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
/// [c1, c2, c3, c0]    [ 9, 10, 11,  8]
/// [d2, d3, d0, d1]    [14, 15, 12, 13]
/// ```
///
/// to:
/// ```text
/// [a0, a1, a2, a3]    [ 0,  1,  2,  3]
/// [b0, b1, b2, b3] == [ 4,  5,  6,  7]
/// [c0, c1, c2, c3]    [ 8,  9, 10, 11]
/// [d0, d1, d2, d3]    [12, 13, 14, 15]
/// ```
///
/// reversing the transformation of [`rows_to_cols`].
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn cols_to_rows(vs: &mut [[__m256i; 4]; N]) {
    // c <<<= 32; d <<<= 64; a <<<= 96;
    for [a, _, c, d] in vs {
        *c = _mm256_shuffle_epi32(*c, 0b_10_01_00_11); // _MM_SHUFFLE(2, 1, 0, 3)
        *d = _mm256_shuffle_epi32(*d, 0b_01_00_11_10); // _MM_SHUFFLE(1, 0, 3, 2)
        *a = _mm256_shuffle_epi32(*a, 0b_00_11_10_01); // _MM_SHUFFLE(0, 3, 2, 1)
    }
}

#[inline]
#[target_feature(enable = "avx2")]
unsafe fn add_xor_rot(vs: &mut [[__m256i; 4]; N]) {
    let rol16_mask = _mm256_set_epi64x(
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302,
        0x0d0c_0f0e_0908_0b0a,
        0x0504_0706_0100_0302,
    );
    let rol8_mask = _mm256_set_epi64x(
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003,
        0x0e0d_0c0f_0a09_080b,
        0x0605_0407_0201_0003,
    );

    // a += b; d ^= a; d <<<= (16, 16, 16, 16);
    for [a, b, _, d] in vs.iter_mut() {
        *a = _mm256_add_epi32(*a, *b);
        *d = _mm256_xor_si256(*d, *a);
        *d = _mm256_shuffle_epi8(*d, rol16_mask);
    }

    // c += d; b ^= c; b <<<= (12, 12, 12, 12);
    for [_, b, c, d] in vs.iter_mut() {
        *c = _mm256_add_epi32(*c, *d);
        *b = _mm256_xor_si256(*b, *c);
        *b = _mm256_xor_si256(_mm256_slli_epi32(*b, 12), _mm256_srli_epi32(*b, 20));
    }

    // a += b; d ^= a; d <<<= (8, 8, 8, 8);
    for [a, b, _, d] in vs.iter_mut() {
        *a = _mm256_add_epi32(*a, *b);
        *d = _mm256_xor_si256(*d, *a);
        *d = _mm256_shuffle_epi8(*d, rol8_mask);
    }

    // c += d; b ^= c; b <<<= (7, 7, 7, 7);
    for [_, b, c, d] in vs.iter_mut() {
        *c = _mm256_add_epi32(*c, *d);
        *b = _mm256_xor_si256(*b, *c);
        *b = _mm256_xor_si256(_mm256_slli_epi32(*b, 7), _mm256_srli_epi32(*b, 25));
    }
}
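
// A hypothetical sanity check, not part of the upstream crate: it exercises the claim in the
// doc comments above that `cols_to_rows` exactly undoes `rows_to_cols`. It is only compiled
// when AVX2 is enabled at build time (e.g. `RUSTFLAGS="-C target-feature=+avx2"`), so that
// calling the `target_feature`-gated helpers from the test is sound on the machine running it.
#[cfg(all(test, target_feature = "avx2"))]
mod shuffle_tests {
    #[cfg(target_arch = "x86")]
    use core::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use core::arch::x86_64::*;

    use super::{cols_to_rows, rows_to_cols, N};

    #[test]
    fn rows_to_cols_then_cols_to_rows_is_identity() {
        unsafe {
            // Fill every state word with a distinct value so any mis-shuffle is visible.
            let mut vs: [[__m256i; 4]; N] = [[_mm256_setzero_si256(); 4]; N];
            for (i, block) in vs.iter_mut().enumerate() {
                for (j, row) in block.iter_mut().enumerate() {
                    let base = (i * 32 + j * 8) as i32;
                    *row = _mm256_set_epi32(
                        base + 7,
                        base + 6,
                        base + 5,
                        base + 4,
                        base + 3,
                        base + 2,
                        base + 1,
                        base,
                    );
                }
            }
            let before: [[[i32; 8]; 4]; N] = core::mem::transmute(vs);

            rows_to_cols(&mut vs);
            cols_to_rows(&mut vs);

            let after: [[[i32; 8]; 4]; N] = core::mem::transmute(vs);
            assert_eq!(before, after);
        }
    }
}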