//! Fixsliced implementations of AES-128, AES-192 and AES-256 (32-bit)
//! adapted from the C implementation.
//!
//! All implementations are fully bitsliced and do not rely on any
//! Look-Up Table (LUT).
//!
//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
//!
//! # Author (original C code)
//!
//! Alexandre Adomnicai, Nanyang Technological University, Singapore
//!
//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.

#![allow(clippy::unreadable_literal)]

use crate::Block;
use cipher::{consts::U2, generic_array::GenericArray};

/// AES block batch size for this implementation
pub(crate) type FixsliceBlocks = U2;

pub(crate) type BatchBlocks = GenericArray<Block, FixsliceBlocks>;

/// AES-128 round keys
pub(crate) type FixsliceKeys128 = [u32; 88];

/// AES-192 round keys
pub(crate) type FixsliceKeys192 = [u32; 104];

/// AES-256 round keys
pub(crate) type FixsliceKeys256 = [u32; 120];

/// 256-bit internal state
pub(crate) type State = [u32; 8];

/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes128_key_schedule(key: &[u8; 16]) -> FixsliceKeys128 {
    let mut rkeys = [0u32; 88];

    bitslice(&mut rkeys[..8], key, key);

    let mut rk_off = 0;
    for rcon in 0..10 {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        if rcon < 8 {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        } else {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
        }

        xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..88).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (8..72).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[72..80]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..11 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}
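// Hedged aside (added commentary, not from the original C code): the four
// `add_round_constant_bit` calls in the `rcon >= 8` branch above mirror the
// set bits of the classical AES round constants 0x1b (bits 0, 1, 3, 4) and
// 0x36 (bits 1, 2, 4, 5). A minimal check of that correspondence, assuming
// only core Rust:
#[cfg(test)]
#[test]
fn rcon_bit_positions() {
    // Generate the classical round constants via xtime in GF(2^8).
    let mut rc: u8 = 1;
    let mut rcons = [0u8; 10];
    for r in rcons.iter_mut() {
        *r = rc;
        rc = (rc << 1) ^ (if rc & 0x80 != 0 { 0x1b } else { 0 });
    }
    // For `rcon` = 8 the constant no longer fits in one bit index:
    // 0x1b has bits {0, 1, 3, 4}, i.e. rcon - 8, rcon - 7, rcon - 5, rcon - 4.
    assert_eq!(rcons[8], 0x1b);
    assert_eq!(rcons[9], 0x36); // bits {1, 2, 4, 5}
}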
/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes192_key_schedule(key: &[u8; 24]) -> FixsliceKeys192 {
    let mut rkeys = [0u32; 104];
    let mut tmp = [0u32; 8];

    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
    bitslice(&mut tmp, &key[8..], &key[8..]);

    let mut rcon = 0;
    let mut rk_off = 8;

    loop {
        for i in 0..8 {
            rkeys[rk_off + i] =
                (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
        }

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = rkeys[rk_off + i];
            ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1));
            ti ^= 0xc0c0c0c0 & (ti << 2);
            tmp[i] = ti;
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        for i in 0..8 {
            let ui = tmp[i];
            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4));
            ti ^= 0x03030303 & (ui >> 6);
            tmp[i] =
                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4))
                | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
            ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3));
            rkeys[rk_off + i] =
                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
        }
        rk_off += 8;

        if rcon >= 8 {
            break;
        }

        for i in 0..8 {
            let ui = rkeys[(rk_off - 8) + i];
            let mut ti = rkeys[(rk_off - 16) + i];
            ti ^= 0x30303030 & (ui >> 2);
            ti ^= 0xc0c0c0c0 & (ti << 2);
            tmp[i] = ti;
        }
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..104).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (0..96).step_by(32) {
            inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
            inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
        }
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..13 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes256_key_schedule(key: &[u8; 32]) -> FixsliceKeys256 {
    let mut rkeys = [0u32; 120];

    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
    bitslice(&mut rkeys[8..16], &key[16..], &key[16..]);

    let mut rk_off = 8;

    let mut rcon = 0;
    loop {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
        rcon += 1;

        if rcon == 7 {
            break;
        }

        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(aes_compact)]
    {
        for i in (8..120).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(aes_compact))]
    {
        for i in (8..104).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[104..112]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..15 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}
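// Hedged sanity sketch (added; not in the original source): each round key
// occupies eight 32-bit words, so the three schedules above emit
// 11 * 8 = 88, 13 * 8 = 104 and 15 * 8 = 120 words for the 10, 12 and 14
// rounds of AES-128/192/256 plus the initial whitening key.
#[cfg(test)]
#[test]
fn round_key_word_counts() {
    assert_eq!(aes128_key_schedule(&[0u8; 16]).len(), 11 * 8);
    assert_eq!(aes192_key_schedule(&[0u8; 24]).len(), 13 * 8);
    assert_eq!(aes256_key_schedule(&[0u8; 32]).len(), 15 * 8);
}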
/// Fully-fixsliced AES-128 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts two blocks in parallel.
pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[80..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(aes_compact))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 72;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-128 encryption (the ShiftRows is completely omitted).
///
/// Encrypts two blocks in parallel.
pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 80 {
            break;
        }

        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(aes_compact))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[80..]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-192 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts two blocks in parallel.
pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[96..]);
    inv_sub_bytes(&mut state);

    let mut rk_off = 88;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }
        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}
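// Hedged known-answer sketch for the AES-128 pair above, using the FIPS-197
// Appendix C.1 vectors. The `GenericArray` constructors and
// `core::array::from_fn` (Rust >= 1.63) are assumptions of this sketch.
#[cfg(test)]
#[test]
fn aes128_fips197_kat() {
    let key: [u8; 16] = core::array::from_fn(|i| i as u8); // 000102...0f
    let pt: [u8; 16] = core::array::from_fn(|i| 0x11 * i as u8); // 001122...ff
    let ct: [u8; 16] = [
        0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
        0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,
    ];

    let rkeys = aes128_key_schedule(&key);
    let blocks = BatchBlocks::clone_from_slice(&[Block::from(pt), Block::from(pt)]);

    let enc = aes128_encrypt(&rkeys, &blocks);
    assert_eq!(enc[0], Block::from(ct));
    assert_eq!(enc[1], Block::from(ct));

    let dec = aes128_decrypt(&rkeys, &enc);
    assert_eq!(dec, blocks);
}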
/// Fully-fixsliced AES-192 encryption (the ShiftRows is completely omitted).
///
/// Encrypts two blocks in parallel.
pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }
        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        if rk_off == 96 {
            break;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[96..]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-256 decryption (the InvShiftRows is completely omitted).
///
/// Decrypts two blocks in parallel.
pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[112..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(aes_compact))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 104;
    loop {
        #[cfg(aes_compact)]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(aes_compact))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state)
}

/// Fully-fixsliced AES-256 encryption (the ShiftRows is completely omitted).
///
/// Encrypts two blocks in parallel.
pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &BatchBlocks) -> BatchBlocks {
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(aes_compact)]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 112 {
            break;
        }

        #[cfg(not(aes_compact))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(aes_compact))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[112..]);

    inv_bitslice(&state)
}
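// Hedged known-answer sketch mirroring the AES-128 one, using the FIPS-197
// Appendix C.2 and C.3 vectors for the four functions above. Same
// `core::array::from_fn` assumption as before.
#[cfg(test)]
#[test]
fn aes192_aes256_fips197_kat() {
    let pt: [u8; 16] = core::array::from_fn(|i| 0x11 * i as u8);
    let blocks = BatchBlocks::clone_from_slice(&[Block::from(pt), Block::from(pt)]);

    let key192: [u8; 24] = core::array::from_fn(|i| i as u8);
    let ct192: [u8; 16] = [
        0xdd, 0xa9, 0x7c, 0xa4, 0x86, 0x4c, 0xdf, 0xe0,
        0x6e, 0xaf, 0x70, 0xa0, 0xec, 0x0d, 0x71, 0x91,
    ];
    let rk192 = aes192_key_schedule(&key192);
    let enc = aes192_encrypt(&rk192, &blocks);
    assert_eq!(enc[0], Block::from(ct192));
    assert_eq!(aes192_decrypt(&rk192, &enc), blocks);

    let key256: [u8; 32] = core::array::from_fn(|i| i as u8);
    let ct256: [u8; 16] = [
        0x8e, 0xa2, 0xb7, 0xca, 0x51, 0x67, 0x45, 0xbf,
        0xea, 0xfc, 0x49, 0x90, 0x4b, 0x49, 0x60, 0x89,
    ];
    let rk256 = aes256_key_schedule(&key256);
    let enc = aes256_encrypt(&rk256, &blocks);
    assert_eq!(enc[0], Block::from(ct256));
    assert_eq!(aes256_decrypt(&rk256, &enc), blocks);
}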
/// Bitsliced implementation of the AES inverse Sbox.
///
/// Note that the 4 bitwise NOT (^= 0xffffffff) are accounted for here so that it is a true
/// inverse of 'sub_bytes'.
fn inv_sub_bytes(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let t23 = u0 ^ u3;
    let t8 = u1 ^ t23;
    let m2 = t23 & t8;
    let t4 = u4 ^ t8;
    let t22 = u1 ^ u3;
    let t2 = u0 ^ u1;
    let t1 = u3 ^ u4;
    // t23 -> stack
    let t9 = u7 ^ t1;
    // t8 -> stack
    let m7 = t22 & t9;
    // t9 -> stack
    let t24 = u4 ^ u7;
    // m7 -> stack
    let t10 = t2 ^ t24;
    // u4 -> stack
    let m14 = t2 & t10;
    let r5 = u6 ^ u7;
    // m2 -> stack
    let t3 = t1 ^ r5;
    // t2 -> stack
    let t13 = t2 ^ r5;
    let t19 = t22 ^ r5;
    // t3 -> stack
    let t17 = u2 ^ t19;
    // t4 -> stack
    let t25 = u2 ^ t1;
    let r13 = u1 ^ u6;
    // t25 -> stack
    let t20 = t24 ^ r13;
    // t17 -> stack
    let m9 = t20 & t17;
    // t20 -> stack
    let r17 = u2 ^ u5;
    // t22 -> stack
    let t6 = t22 ^ r17;
    // t13 -> stack
    let m1 = t13 & t6;
    let y5 = u0 ^ r17;
    let m4 = t19 & y5;
    let m5 = m4 ^ m1;
    let m17 = m5 ^ t24;
    let r18 = u5 ^ u6;
    let t27 = t1 ^ r18;
    let t15 = t10 ^ t27;
    // t6 -> stack
    let m11 = t1 & t15;
    let m15 = m14 ^ m11;
    let m21 = m17 ^ m15;
    // t1 -> stack
    // t4 <- stack
    let m12 = t4 & t27;
    let m13 = m12 ^ m11;
    let t14 = t10 ^ r18;
    let m3 = t14 ^ m1;
    // m2 <- stack
    let m16 = m3 ^ m2;
    let m20 = m16 ^ m13;
    // u4 <- stack
    let r19 = u2 ^ u4;
    let t16 = r13 ^ r19;
    // t3 <- stack
    let t26 = t3 ^ t16;
    let m6 = t3 & t16;
    let m8 = t26 ^ m6;
    // t10 -> stack
    // m7 <- stack
    let m18 = m8 ^ m7;
    let m22 = m18 ^ m13;
    let m25 = m22 & m20;
    let m26 = m21 ^ m25;
    let m10 = m9 ^ m6;
    let m19 = m10 ^ m15;
    // t25 <- stack
    let m23 = m19 ^ t25;
    let m28 = m23 ^ m25;
    let m24 = m22 ^ m23;
    let m30 = m26 & m24;
    let m39 = m23 ^ m30;
    let m48 = m39 & y5;
    let m57 = m39 & t19;
    // m48 -> stack
    let m36 = m24 ^ m25;
    let m31 = m20 & m23;
    let m27 = m20 ^ m21;
    let m32 = m27 & m31;
    let m29 = m28 & m27;
    let m37 = m21 ^ m29;
    // m39 -> stack
    let m42 = m37 ^ m39;
    let m52 = m42 & t15;
    // t27 -> stack
    // t1 <- stack
    let m61 = m42 & t1;
    let p0 = m52 ^ m61;
    let p16 = m57 ^ m61;
    // m57 -> stack
    // t20 <- stack
    let m60 = m37 & t20;
    // p16 -> stack
    // t17 <- stack
    let m51 = m37 & t17;
    let m33 = m27 ^ m25;
    let m38 = m32 ^ m33;
    let m43 = m37 ^ m38;
    let m49 = m43 & t16;
    let p6 = m49 ^ m60;
    let p13 = m49 ^ m51;
    let m58 = m43 & t3;
    // t9 <- stack
    let m50 = m38 & t9;
    // t22 <- stack
    let m59 = m38 & t22;
    // p6 -> stack
    let p1 = m58 ^ m59;
    let p7 = p0 ^ p1;
    let m34 = m21 & m22;
    let m35 = m24 & m34;
    let m40 = m35 ^ m36;
    let m41 = m38 ^ m40;
    let m45 = m42 ^ m41;
    // t27 <- stack
    let m53 = m45 & t27;
    let p8 = m50 ^ m53;
    let p23 = p7 ^ p8;
    // t4 <- stack
    let m62 = m45 & t4;
    let p14 = m49 ^ m62;
    let s6 = p14 ^ p23;
    // t10 <- stack
    let m54 = m41 & t10;
    let p2 = m54 ^ m62;
    let p22 = p2 ^ p7;
    let s0 = p13 ^ p22;
    let p17 = m58 ^ p2;
    let p15 = m54 ^ m59;
    // t2 <- stack
    let m63 = m41 & t2;
    // m39 <- stack
    let m44 = m39 ^ m40;
    // p17 -> stack
    // t6 <- stack
    let m46 = m44 & t6;
    let p5 = m46 ^ m51;
    // p23 -> stack
    let p18 = m63 ^ p5;
    let p24 = p5 ^ p7;
    // m48 <- stack
    let p12 = m46 ^ m48;
    let s3 = p12 ^ p22;
    // t13 <- stack
    let m55 = m44 & t13;
    let p9 = m55 ^ m63;
    // p16 <- stack
    let s7 = p9 ^ p16;
    // t8 <- stack
    let m47 = m40 & t8;
    let p3 = m47 ^ m50;
    let p19 = p2 ^ p3;
    let s5 = p19 ^ p24;
    let p11 = p0 ^ p3;
    let p26 = p9 ^ p11;
    // t23 <- stack
    let m56 = m40 & t23;
    let p4 = m48 ^ m56;
    // p6 <- stack
    let p20 = p4 ^ p6;
    let p29 = p15 ^ p20;
    let s1 = p26 ^ p29;
    // m57 <- stack
    let p10 = m57 ^ p4;
    let p27 = p10 ^ p18;
    // p23 <- stack
    let s4 = p23 ^ p27;
    let p25 = p6 ^ p10;
    let p28 = p11 ^ p25;
    // p17 <- stack
    let s2 = p17 ^ p28;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// Bitsliced implementation of the AES Sbox based on Boyar, Peralta and Calik.
///
/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
///
/// Note that the 4 bitwise NOT (^= 0xffffffff) are moved to the key schedule.
fn sub_bytes(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let y14 = u3 ^ u5;
    let y13 = u0 ^ u6;
    let y12 = y13 ^ y14;
    let t1 = u4 ^ y12;
    let y15 = t1 ^ u5;
    let t2 = y12 & y15;
    let y6 = y15 ^ u7;
    let y20 = t1 ^ u1;
    // y12 -> stack
    let y9 = u0 ^ u3;
    // y20 -> stack
    let y11 = y20 ^ y9;
    // y9 -> stack
    let t12 = y9 & y11;
    // y6 -> stack
    let y7 = u7 ^ y11;
    let y8 = u0 ^ u5;
    let t0 = u1 ^ u2;
    let y10 = y15 ^ t0;
    // y15 -> stack
    let y17 = y10 ^ y11;
    // y14 -> stack
    let t13 = y14 & y17;
    let t14 = t13 ^ t12;
    // y17 -> stack
    let y19 = y10 ^ y8;
    // y10 -> stack
    let t15 = y8 & y10;
    let t16 = t15 ^ t12;
    let y16 = t0 ^ y11;
    // y11 -> stack
    let y21 = y13 ^ y16;
    // y13 -> stack
    let t7 = y13 & y16;
    // y16 -> stack
    let y18 = u0 ^ y16;
    let y1 = t0 ^ u7;
    let y4 = y1 ^ u3;
    // u7 -> stack
    let t5 = y4 & u7;
    let t6 = t5 ^ t2;
    let t18 = t6 ^ t16;
    let t22 = t18 ^ y19;
    let y2 = y1 ^ u0;
    let t10 = y2 & y7;
    let t11 = t10 ^ t7;
    let t20 = t11 ^ t16;
    let t24 = t20 ^ y18;
    let y5 = y1 ^ u6;
    let t8 = y5 & y1;
    let t9 = t8 ^ t7;
    let t19 = t9 ^ t14;
    let t23 = t19 ^ y21;
    let y3 = y5 ^ y8;
    // y6 <- stack
    let t3 = y3 & y6;
    let t4 = t3 ^ t2;
    // y20 <- stack
    let t17 = t4 ^ y20;
    let t21 = t17 ^ t14;
    let t26 = t21 & t23;
    let t27 = t24 ^ t26;
    let t31 = t22 ^ t26;
    let t25 = t21 ^ t22;
    // y4 -> stack
    let t28 = t25 & t27;
    let t29 = t28 ^ t22;
    let z14 = t29 & y2;
    let z5 = t29 & y7;
    let t30 = t23 ^ t24;
    let t32 = t31 & t30;
    let t33 = t32 ^ t24;
    let t35 = t27 ^ t33;
    let t36 = t24 & t35;
    let t38 = t27 ^ t36;
    let t39 = t29 & t38;
    let t40 = t25 ^ t39;
    let t43 = t29 ^ t40;
    // y16 <- stack
    let z3 = t43 & y16;
    let tc12 = z3 ^ z5;
    // tc12 -> stack
    // y13 <- stack
    let z12 = t43 & y13;
    let z13 = t40 & y5;
    let z4 = t40 & y1;
    let tc6 = z3 ^ z4;
    let t34 = t23 ^ t33;
    let t37 = t36 ^ t34;
    let t41 = t40 ^ t37;
    // y10 <- stack
    let z8 = t41 & y10;
    let z17 = t41 & y8;
    let t44 = t33 ^ t37;
    // y15 <- stack
    let z0 = t44 & y15;
    // z17 -> stack
    // y12 <- stack
    let z9 = t44 & y12;
    let z10 = t37 & y3;
    let z1 = t37 & y6;
    let tc5 = z1 ^ z0;
    let tc11 = tc6 ^ tc5;
    // y4 <- stack
    let z11 = t33 & y4;
    let t42 = t29 ^ t33;
    let t45 = t42 ^ t41;
    // y17 <- stack
    let z7 = t45 & y17;
    let tc8 = z7 ^ tc6;
    // y14 <- stack
    let z16 = t45 & y14;
    // y11 <- stack
    let z6 = t42 & y11;
    let tc16 = z6 ^ tc8;
    // z14 -> stack
    // y9 <- stack
    let z15 = t42 & y9;
    let tc20 = z15 ^ tc16;
    let tc1 = z15 ^ z16;
    let tc2 = z10 ^ tc1;
    let tc21 = tc2 ^ z11;
    let tc3 = z9 ^ tc2;
    let s0 = tc3 ^ tc16;
    let s3 = tc3 ^ tc11;
    let s1 = s3 ^ tc16;
    let tc13 = z13 ^ tc1;
    // u7 <- stack
    let z2 = t33 & u7;
    let tc4 = z0 ^ z2;
    let tc7 = z12 ^ tc4;
    let tc9 = z8 ^ tc7;
    let tc10 = tc8 ^ tc9;
    // z14 <- stack
    let tc17 = z14 ^ tc10;
    let s5 = tc21 ^ tc17;
    let tc26 = tc17 ^ tc20;
    // z17 <- stack
    let s2 = tc26 ^ z17;
    // tc12 <- stack
    let tc14 = tc4 ^ tc12;
    let tc18 = tc13 ^ tc14;
    let s6 = tc10 ^ tc18;
    let s7 = z12 ^ tc18;
    let s4 = tc14 ^ s3;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// NOT operations that are omitted in S-box
#[inline]
fn sub_bytes_nots(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);
    state[0] ^= 0xffffffff;
    state[1] ^= 0xffffffff;
    state[5] ^= 0xffffffff;
    state[6] ^= 0xffffffff;
}

/// Computation of the MixColumns transformation in the fixsliced representation, with different
/// rotations used according to the round number mod 4.
///
/// Based on Käsper-Schwabe, similar to https://github.com/Ko-/aes-armcortexm.
macro_rules! define_mix_columns {
    (
        $name:ident,
        $name_inv:ident,
        $first_rotate:path,
        $second_rotate:path
    ) => {
        #[rustfmt::skip]
        fn $name(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0, a1 ^ b1, a2 ^ b2, a3 ^ b3, a4 ^ b4, a5 ^ b5, a6 ^ b6, a7 ^ b7,
            );
            state[0] = b0 ^ c7 ^ $second_rotate(c0);
            state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
            state[2] = b2 ^ c1 ^ $second_rotate(c2);
            state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
            state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
            state[5] = b5 ^ c4 ^ $second_rotate(c5);
            state[6] = b6 ^ c5 ^ $second_rotate(c6);
            state[7] = b7 ^ c6 ^ $second_rotate(c7);
        }

        #[rustfmt::skip]
        fn $name_inv(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0, a1 ^ b1, a2 ^ b2, a3 ^ b3, a4 ^ b4, a5 ^ b5, a6 ^ b6, a7 ^ b7,
            );
            let (d0, d1, d2, d3, d4, d5, d6, d7) = (
                a0 ^ c7,
                a1 ^ c0 ^ c7,
                a2 ^ c1,
                a3 ^ c2 ^ c7,
                a4 ^ c3 ^ c7,
                a5 ^ c4,
                a6 ^ c5,
                a7 ^ c6,
            );
            let (e0, e1, e2, e3, e4, e5, e6, e7) = (
                c0 ^ d6,
                c1 ^ d6 ^ d7,
                c2 ^ d0 ^ d7,
                c3 ^ d1 ^ d6,
                c4 ^ d2 ^ d6 ^ d7,
                c5 ^ d3 ^ d7,
                c6 ^ d4,
                c7 ^ d5,
            );
            state[0] = d0 ^ e0 ^ $second_rotate(e0);
            state[1] = d1 ^ e1 ^ $second_rotate(e1);
            state[2] = d2 ^ e2 ^ $second_rotate(e2);
            state[3] = d3 ^ e3 ^ $second_rotate(e3);
            state[4] = d4 ^ e4 ^ $second_rotate(e4);
            state[5] = d5 ^ e5 ^ $second_rotate(e5);
            state[6] = d6 ^ e6 ^ $second_rotate(e6);
            state[7] = d7 ^ e7 ^ $second_rotate(e7);
        }
    }
}

define_mix_columns!(
    mix_columns_0,
    inv_mix_columns_0,
    rotate_rows_1,
    rotate_rows_2
);

define_mix_columns!(
    mix_columns_1,
    inv_mix_columns_1,
    rotate_rows_and_columns_1_1,
    rotate_rows_and_columns_2_2
);

#[cfg(not(aes_compact))]
define_mix_columns!(
    mix_columns_2,
    inv_mix_columns_2,
    rotate_rows_and_columns_1_2,
    rotate_rows_2
);

#[cfg(not(aes_compact))]
define_mix_columns!(
    mix_columns_3,
    inv_mix_columns_3,
    rotate_rows_and_columns_1_3,
    rotate_rows_and_columns_2_2
);

#[inline]
fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) {
    let t = (*a ^ ((*a) >> shift)) & mask;
    *a ^= t ^ (t << shift);
}

#[inline]
fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) {
    let t = (*a ^ ((*b) >> shift)) & mask;
    *a ^= t;
    *b ^= t << shift;
}
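// Hedged illustration (added; the example values are mine): a delta swap
// exchanges the bit group selected by `mask` with the group `shift` positions
// above it. With shift = 4 and mask = 0x0f0f0f0f it swaps adjacent nibbles:
#[cfg(test)]
#[test]
fn delta_swap_example() {
    let mut a = 0x12345678u32;
    delta_swap_1(&mut a, 4, 0x0f0f0f0f);
    assert_eq!(a, 0x21436587);
}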
/// Applies ShiftRows once on an AES state (or key).
#[cfg(any(not(aes_compact), feature = "hazmat"))]
#[inline]
fn shift_rows_1(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 4, 0x0c0f0300);
        delta_swap_1(x, 2, 0x33003300);
    }
}

/// Applies ShiftRows twice on an AES state (or key).
#[inline]
fn shift_rows_2(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 4, 0x0f000f00);
    }
}

/// Applies ShiftRows three times on an AES state (or key).
#[inline]
fn shift_rows_3(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 4, 0x030f0c00);
        delta_swap_1(x, 2, 0x33003300);
    }
}

#[inline(always)]
fn inv_shift_rows_1(state: &mut [u32]) {
    shift_rows_3(state);
}

#[inline(always)]
fn inv_shift_rows_2(state: &mut [u32]) {
    shift_rows_2(state);
}

#[cfg(not(aes_compact))]
#[inline(always)]
fn inv_shift_rows_3(state: &mut [u32]) {
    shift_rows_1(state);
}

/// XOR the columns after the S-box during the key schedule round function.
///
/// The `idx_xor` parameter refers to the index of the previous round key that is
/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
/// respectively).
///
/// The `idx_ror` parameter refers to the rotation value, which varies between the
/// different key schedules.
fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) {
    for i in 0..8 {
        let off_i = offset + i;
        let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror));
        rkeys[off_i] =
            rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6));
    }
}
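// Hedged property sketch (added): every `delta_swap_1` call is an involution,
// and rotating the rows by two twice is a full rotation by four, so
// `shift_rows_2` undoes itself. This is exactly why `inv_shift_rows_2`
// forwards to `shift_rows_2`.
#[cfg(test)]
#[test]
fn shift_rows_2_is_an_involution() {
    let mut s: [u32; 8] = core::array::from_fn(|i| 0x9e3779b9u32.wrapping_mul(i as u32 + 1));
    let orig = s;
    shift_rows_2(&mut s);
    shift_rows_2(&mut s);
    assert_eq!(s, orig);
}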
/// Bitslice two 128-bit input blocks input0, input1 into a 256-bit internal state.
fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) {
    debug_assert_eq!(output.len(), 8);
    debug_assert_eq!(input0.len(), 16);
    debug_assert_eq!(input1.len(), 16);

    // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at
    // an 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
    // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The desired bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b0

    // Interleave the columns on input (note the order of input)
    //     b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __
    let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap());
    let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap());
    let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap());
    let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap());
    let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap());
    let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap());
    let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap());
    let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap());

    // Bit Index Swap 5 <-> 0:
    //     __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0
    let m0 = 0x55555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 6 <-> 1:
    //     __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __
    let m1 = 0x33333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 7 <-> 2:
    //     c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __
    let m2 = 0x0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    // Final bitsliced bit index, as desired:
    //     p2 p1 p0 r1 r0 c1 c0 b0
    output[0] = t0;
    output[1] = t1;
    output[2] = t2;
    output[3] = t3;
    output[4] = t4;
    output[5] = t5;
    output[6] = t6;
    output[7] = t7;
}
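// Hedged round-trip sketch (added): `inv_bitslice` below undoes `bitslice`
// for any pair of blocks; the input bytes here are arbitrary.
#[cfg(test)]
#[test]
fn bitslice_round_trip() {
    let b0 = Block::from(core::array::from_fn::<u8, 16, _>(|i| i as u8));
    let b1 = Block::from(core::array::from_fn::<u8, 16, _>(|i| !(i as u8)));
    let mut state = State::default();
    bitslice(&mut state, &b0, &b1);
    let out = inv_bitslice(&state);
    assert_eq!(out[0], b0);
    assert_eq!(out[1], b1);
}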
/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output.
fn inv_bitslice(input: &[u32]) -> BatchBlocks {
    debug_assert_eq!(input.len(), 8);

    // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at
    // an 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
    // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The initially bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b0

    let mut t0 = input[0];
    let mut t1 = input[1];
    let mut t2 = input[2];
    let mut t3 = input[3];
    let mut t4 = input[4];
    let mut t5 = input[5];
    let mut t6 = input[6];
    let mut t7 = input[7];

    // TODO: these bit index swaps are identical to those in 'packing'

    // Bit Index Swap 5 <-> 0:
    //     __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0
    let m0 = 0x55555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 6 <-> 1:
    //     __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __
    let m1 = 0x33333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 7 <-> 2:
    //     p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __
    let m2 = 0x0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    let mut output = BatchBlocks::default();

    // De-interleave the columns on output (note the order of output)
    //     c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __
    output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes());
    output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes());
    output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes());
    output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes());
    output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes());
    output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes());
    output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes());
    output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes());

    // Final AES bit index, as desired:
    //     b0 c1 c0 r1 r0 p2 p1 p0
    output
}

/// Copy 32 bytes (eight `u32` words) within the provided slice forward by
/// eight words, i.e. from `src_offset` to `src_offset + 8`.
fn memshift32(buffer: &mut [u32], src_offset: usize) {
    debug_assert_eq!(src_offset % 8, 0);

    let dst_offset = src_offset + 8;
    debug_assert!(dst_offset + 8 <= buffer.len());

    for i in (0..8).rev() {
        buffer[dst_offset + i] = buffer[src_offset + i];
    }
}
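// Hedged equivalence note (added): the reverse loop above is a fixed-size
// overlapping copy; on Rust >= 1.37 it matches `slice::copy_within`.
#[cfg(test)]
#[test]
fn memshift32_matches_copy_within() {
    let mut a: [u32; 24] = core::array::from_fn(|i| i as u32);
    let mut b = a;
    memshift32(&mut a, 8);
    b.copy_within(8..16, 16);
    assert_eq!(a, b);
}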
/// XOR the round key to the internal state. The round keys are expected to be
/// pre-computed and to be packed in the fixsliced representation.
#[inline]
fn add_round_key(state: &mut State, rkey: &[u32]) {
    debug_assert_eq!(rkey.len(), 8);
    for (a, b) in state.iter_mut().zip(rkey) {
        *a ^= b;
    }
}

#[inline(always)]
fn add_round_constant_bit(state: &mut [u32], bit: usize) {
    state[bit] ^= 0x0000c000;
}

#[inline(always)]
fn ror(x: u32, y: u32) -> u32 {
    x.rotate_right(y)
}

#[inline(always)]
fn ror_distance(rows: u32, cols: u32) -> u32 {
    (rows << 3) + (cols << 1)
}

#[inline(always)]
fn rotate_rows_1(x: u32) -> u32 {
    ror(x, ror_distance(1, 0))
}

#[inline(always)]
fn rotate_rows_2(x: u32) -> u32 {
    ror(x, ror_distance(2, 0))
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_1(x: u32) -> u32 {
    (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) |
    (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0)
}

#[cfg(not(aes_compact))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_2(x: u32) -> u32 {
    (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) |
    (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0)
}

#[cfg(not(aes_compact))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_3(x: u32) -> u32 {
    (ror(x, ror_distance(1, 3)) & 0x03030303) |
    (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc)
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_2_2(x: u32) -> u32 {
    (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) |
    (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0)
}

/// Low-level "hazmat" AES functions.
///
/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
/// implementations in this crate, but instead provides raw access to
/// the AES round function gated under the `hazmat` crate feature.
#[cfg(feature = "hazmat")]
pub(crate) mod hazmat {
    use super::{
        bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0,
        shift_rows_1, sub_bytes, sub_bytes_nots, State,
    };
    use crate::{Block, Block8};

    /// XOR the `src` block into the `dst` block in-place.
    fn xor_in_place(dst: &mut Block, src: &Block) {
        for (a, b) in dst.iter_mut().zip(src.as_slice()) {
            *a ^= *b;
        }
    }

    /// Perform a bitslice operation, loading a single block.
    fn bitslice_block(block: &Block) -> State {
        let mut state = State::default();
        bitslice(&mut state, block, block);
        state
    }

    /// Perform an inverse bitslice operation, extracting a single block.
    fn inv_bitslice_block(block: &mut Block, state: &State) {
        let out = inv_bitslice(state);
        block.copy_from_slice(&out[0]);
    }

    /// AES cipher (encrypt) round function.
    #[inline]
    pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) {
        let mut state = bitslice_block(block);
        sub_bytes(&mut state);
        sub_bytes_nots(&mut state);
        shift_rows_1(&mut state);
        mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
        xor_in_place(block, round_key);
    }

    /// AES cipher (encrypt) round function: parallel version.
    #[inline]
    pub(crate) fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
        for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) {
            let mut state = State::default();
            bitslice(&mut state, &chunk[0], &chunk[1]);
            sub_bytes(&mut state);
            sub_bytes_nots(&mut state);
            shift_rows_1(&mut state);
            mix_columns_0(&mut state);
            let res = inv_bitslice(&state);

            for i in 0..2 {
                chunk[i] = res[i];
                xor_in_place(&mut chunk[i], &keys[i]);
            }
        }
    }
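    // Hedged known-answer sketch (added): SubBytes maps 0x00 to 0x63, and a
    // constant block is a fixed point of ShiftRows and of MixColumns (each
    // output byte is ({02} ^ {03} ^ {01} ^ {01}) * x = x), so one cipher
    // round on an all-zero block with an all-zero round key yields all 0x63.
    #[cfg(test)]
    #[test]
    fn cipher_round_all_zero() {
        let mut block = Block::default();
        cipher_round(&mut block, &Block::default());
        assert_eq!(block.as_slice(), [0x63u8; 16].as_slice());
    }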
    /// AES equivalent inverse cipher (decrypt) round function.
    #[inline]
    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
        let mut state = bitslice_block(block);
        sub_bytes_nots(&mut state);
        inv_sub_bytes(&mut state);
        inv_shift_rows_1(&mut state);
        inv_mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
        xor_in_place(block, round_key);
    }

    /// AES equivalent inverse cipher (decrypt) round function: parallel version.
    #[inline]
    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: &Block8) {
        for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) {
            let mut state = State::default();
            bitslice(&mut state, &chunk[0], &chunk[1]);
            sub_bytes_nots(&mut state);
            inv_sub_bytes(&mut state);
            inv_shift_rows_1(&mut state);
            inv_mix_columns_0(&mut state);
            let res = inv_bitslice(&state);

            for i in 0..2 {
                chunk[i] = res[i];
                xor_in_place(&mut chunk[i], &keys[i]);
            }
        }
    }

    /// AES mix columns function.
    #[inline]
    pub(crate) fn mix_columns(block: &mut Block) {
        let mut state = bitslice_block(block);
        mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
    }

    /// AES inverse mix columns function.
    #[inline]
    pub(crate) fn inv_mix_columns(block: &mut Block) {
        let mut state = bitslice_block(block);
        inv_mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
    }
}
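// Hedged round-trip sketch (added; gated like the module above): the hazmat
// `mix_columns` and `inv_mix_columns` helpers are mutual inverses.
#[cfg(all(test, feature = "hazmat"))]
#[test]
fn hazmat_mix_columns_round_trip() {
    let mut block = Block::from([0x5au8; 16]);
    let orig = block;
    hazmat::mix_columns(&mut block);
    hazmat::inv_mix_columns(&mut block);
    assert_eq!(block, orig);
}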