// Copyright 2016 The OpenSSL Project Authors. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 #include #include "internal.h" #include "../../internal.h" #include "../cpucap/internal.h" static const uint64_t iotas[] = { 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808aULL, 0x8000000080008000ULL, 0x000000000000808bULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL, 0x000000000000008aULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000aULL, 0x000000008000808bULL, 0x800000000000008bULL, 0x8000000000008089ULL, 0x8000000000008003ULL, 0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800aULL, 0x800000008000000aULL, 0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL }; #if !defined(KECCAK1600_ASM) static const uint8_t rhotates[KECCAK1600_ROWS][KECCAK1600_ROWS] = { { 0, 1, 62, 28, 27 }, { 36, 44, 6, 55, 20 }, { 3, 10, 43, 25, 39 }, { 41, 45, 15, 21, 8 }, { 18, 2, 61, 56, 14 } }; #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \ defined(__mips) || defined(__riscv) || defined(__s390__) || defined(__loongarch__) || \ defined(__EMSCRIPTEN__) // These platforms don't support "logical and with complement" instruction. # define KECCAK_COMPLEMENTING_TRANSFORM #endif static uint64_t ROL64(uint64_t val, int offset) { if (offset == 0) { return val; } else { return (val << offset) | (val >> (64-offset)); } } // KECCAK_2X: // This is the default implementation used in OpenSSL and the most efficient; // the other implementations were removed from this file. // This implementation is a variant of KECCAK_1X (see OpenSSL) // This implementation allows to take temporary storage // out of round procedure and simplify references to it by alternating // it with actual data (see round loop below). 
// It ensures best compiler interpretation to assembly and provides best // instruction per processed byte ratio at minimal round unroll factor. static void Round(uint64_t R[KECCAK1600_ROWS][KECCAK1600_ROWS], uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], size_t i) { uint64_t C[KECCAK1600_ROWS], D[KECCAK1600_ROWS]; assert(i < (sizeof(iotas) / sizeof(iotas[0]))); C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0]; C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1]; C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2]; C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3]; C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4]; D[0] = ROL64(C[1], 1) ^ C[4]; D[1] = ROL64(C[2], 1) ^ C[0]; D[2] = ROL64(C[3], 1) ^ C[1]; D[3] = ROL64(C[4], 1) ^ C[2]; D[4] = ROL64(C[0], 1) ^ C[3]; C[0] = A[0][0] ^ D[0]; C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]); C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]); C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]); C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]); #ifdef KECCAK_COMPLEMENTING_TRANSFORM R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]; R[0][1] = C[1] ^ (~C[2] | C[3]); R[0][2] = C[2] ^ ( C[3] & C[4]); R[0][3] = C[3] ^ ( C[4] | C[0]); R[0][4] = C[4] ^ ( C[0] & C[1]); #else R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i]; R[0][1] = C[1] ^ (~C[2] & C[3]); R[0][2] = C[2] ^ (~C[3] & C[4]); R[0][3] = C[3] ^ (~C[4] & C[0]); R[0][4] = C[4] ^ (~C[0] & C[1]); #endif C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]); C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]); C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]); C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]); C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]); #ifdef KECCAK_COMPLEMENTING_TRANSFORM R[1][0] = C[0] ^ (C[1] | C[2]); R[1][1] = C[1] ^ (C[2] & C[3]); R[1][2] = C[2] ^ (C[3] | ~C[4]); R[1][3] = C[3] ^ (C[4] | C[0]); R[1][4] = C[4] ^ (C[0] & C[1]); #else R[1][0] = C[0] ^ (~C[1] & C[2]); R[1][1] = C[1] ^ (~C[2] & C[3]); R[1][2] = C[2] ^ (~C[3] & C[4]); R[1][3] = C[3] ^ (~C[4] & C[0]); R[1][4] = C[4] ^ (~C[0] & 
C[1]); #endif C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]); C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]); C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]); C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]); C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]); #ifdef KECCAK_COMPLEMENTING_TRANSFORM R[2][0] = C[0] ^ ( C[1] | C[2]); R[2][1] = C[1] ^ ( C[2] & C[3]); R[2][2] = C[2] ^ (~C[3] & C[4]); R[2][3] = ~C[3] ^ ( C[4] | C[0]); R[2][4] = C[4] ^ ( C[0] & C[1]); #else R[2][0] = C[0] ^ (~C[1] & C[2]); R[2][1] = C[1] ^ (~C[2] & C[3]); R[2][2] = C[2] ^ (~C[3] & C[4]); R[2][3] = C[3] ^ (~C[4] & C[0]); R[2][4] = C[4] ^ (~C[0] & C[1]); #endif C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]); C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]); C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]); C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]); C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]); #ifdef KECCAK_COMPLEMENTING_TRANSFORM R[3][0] = C[0] ^ ( C[1] & C[2]); R[3][1] = C[1] ^ ( C[2] | C[3]); R[3][2] = C[2] ^ (~C[3] | C[4]); R[3][3] = ~C[3] ^ ( C[4] & C[0]); R[3][4] = C[4] ^ ( C[0] | C[1]); #else R[3][0] = C[0] ^ (~C[1] & C[2]); R[3][1] = C[1] ^ (~C[2] & C[3]); R[3][2] = C[2] ^ (~C[3] & C[4]); R[3][3] = C[3] ^ (~C[4] & C[0]); R[3][4] = C[4] ^ (~C[0] & C[1]); #endif C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]); C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]); C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]); C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]); C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]); #ifdef KECCAK_COMPLEMENTING_TRANSFORM R[4][0] = C[0] ^ (~C[1] & C[2]); R[4][1] = ~C[1] ^ ( C[2] | C[3]); R[4][2] = C[2] ^ ( C[3] & C[4]); R[4][3] = C[3] ^ ( C[4] | C[0]); R[4][4] = C[4] ^ ( C[0] & C[1]); #else R[4][0] = C[0] ^ (~C[1] & C[2]); R[4][1] = C[1] ^ (~C[2] & C[3]); R[4][2] = C[2] ^ (~C[3] & C[4]); R[4][3] = C[3] ^ (~C[4] & C[0]); R[4][4] = C[4] ^ (~C[0] & C[1]); #endif } static void KeccakF1600_c(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) { uint64_t T[KECCAK1600_ROWS][KECCAK1600_ROWS]; size_t i; #ifdef 
KECCAK_COMPLEMENTING_TRANSFORM A[0][1] = ~A[0][1]; A[0][2] = ~A[0][2]; A[1][3] = ~A[1][3]; A[2][2] = ~A[2][2]; A[3][2] = ~A[3][2]; A[4][0] = ~A[4][0]; #endif for (i = 0; i < 24; i += 2) { Round(T, A, i); Round(A, T, i + 1); } #ifdef KECCAK_COMPLEMENTING_TRANSFORM A[0][1] = ~A[0][1]; A[0][2] = ~A[0][2]; A[1][3] = ~A[1][3]; A[2][2] = ~A[2][2]; A[3][2] = ~A[3][2]; A[4][0] = ~A[4][0]; #endif } #endif // !KECCAK1600_ASM // Forward declaration for KeccakF1600 function void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]); // Keccak1600_Absorb can be called multiple times; at each invocation the // largest multiple of |r| out of |len| bytes are processed. The // remaining amount of bytes is returned. This is done to spare caller // trouble of calculating the largest multiple of |r|. |r| can be viewed // as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104, // 72, but can also be (1600 - 448)/8 = 144. All this means that message // padding and intermediate sub-block buffering, byte- or bitwise, is // caller's responsibility. // KeccakF1600_XORBytes XORs |len| bytes from |inp| into the Keccak state |A|. // |len| must be a multiple of 8. 
static void KeccakF1600_XORBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len) { assert(len <= SHA3_MAX_BLOCKSIZE); assert((len % 8) == 0); uint64_t *A_flat = (uint64_t *)A; size_t w = len / 8; for (size_t i = 0; i < w; i++) { uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 | (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 | (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 | (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56; inp += 8; A_flat[i] ^= Ai; } } size_t Keccak1600_Absorb(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len, size_t r) { assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0); while (len >= r) { KeccakF1600_XORBytes(A, inp, r); KeccakF1600(A); inp += r; len -= r; } return len; } // KeccakF1600_ExtractBytes extracts |len| bytes from the Keccak state |A| into |out|. // This function operates on up to block_size bytes (a single block). For extracting // more data, the state must be processed again through KeccakF1600 (see Keccak1600_Squeeze). static void KeccakF1600_ExtractBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len) { uint64_t *A_flat = (uint64_t *)A; assert(len <= SHA3_MAX_BLOCKSIZE); size_t i = 0; while (len != 0) { uint64_t Ai = A_flat[i]; if (len < 8) { for (size_t j = 0; j < len; j++) { *out++ = (uint8_t)Ai; Ai >>= 8; } return; } out[0] = (uint8_t)(Ai); out[1] = (uint8_t)(Ai >> 8); out[2] = (uint8_t)(Ai >> 16); out[3] = (uint8_t)(Ai >> 24); out[4] = (uint8_t)(Ai >> 32); out[5] = (uint8_t)(Ai >> 40); out[6] = (uint8_t)(Ai >> 48); out[7] = (uint8_t)(Ai >> 56); out += 8; len -= 8; i++; } } void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len, size_t r, int padded) { assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0); while (len != 0) { if (padded) { KeccakF1600(A); } padded = 1; size_t extract_len = len < r ? 
len : r; KeccakF1600_ExtractBytes(A, out, extract_len); out += extract_len; len -= extract_len; } } #if defined(KECCAK1600_ASM) // Scalar implementation from OpenSSL provided by keccak1600-armv8.pl extern void KeccakF1600_hw(uint64_t state[25]); #if defined(OPENSSL_AARCH64) static void keccak_log_dispatch(size_t id) { #if BORINGSSL_DISPATCH_TEST BORINGSSL_function_hit[id] = 1; #endif } #endif void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) { // Dispatch logic for Keccak-x1 on AArch64: // // 1. If ASM is disabled, we use the C implementation. // 2. If ASM is enabled: // - For Neoverse N1, V1, V2, we use scalar Keccak assembly from s2n-bignum // (`sha3_keccak_f1600()`) // leveraging lazy rotations from https://eprint.iacr.org/2022/1243. // - Otherwise, if the Neon SHA3 extension is supported, we use the Neon // Keccak assembly from s2n-bignum (`sha3_keccak_f1600_alt()`), // leveraging that extension. // - Otherwise, fall back to scalar Keccak implementation from OpenSSL, // (`Keccak1600_hw()`), not using lazy rotations. // // Lazy rotations improve performance by up to 10% on CPUs with free // Barrel shifting, which includes Neoverse N1, V1, and V2. Not all // CPUs have free Barrel shifting (e.g. Apple M1 or Cortex-A72), so we // don't use it by default. // // Neoverse V1 and V2 do support SHA3 instructions, but they are only // implemented on 1/4 of Neon units, and are thus slower than a scalar // implementation. 
#if defined(OPENSSL_AARCH64) #if defined(KECCAK1600_S2N_BIGNUM_ASM) if (CRYPTO_is_Neoverse_N1() || CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) { keccak_log_dispatch(10); // kFlag_sha3_keccak_f1600 sha3_keccak_f1600((uint64_t *)A, iotas); return; } #if defined(MY_ASSEMBLER_SUPPORTS_NEON_SHA3_EXTENSION) if (CRYPTO_is_ARMv8_SHA3_capable()) { keccak_log_dispatch(11); // kFlag_sha3_keccak_f1600_alt sha3_keccak_f1600_alt((uint64_t *)A, iotas); return; } #endif #endif keccak_log_dispatch(9); // kFlag_KeccakF1600_hw KeccakF1600_hw((uint64_t *) A); #elif defined(OPENSSL_X86_64) sha3_keccak_f1600((uint64_t *)A, iotas); #endif } #else // KECCAK1600_ASM void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) { KeccakF1600_c(A); } #endif // !KECCAK1600_ASM // KeccakF1600_XORBytes_x4 XORs |len| bytes from |inp0|, |inp1|, |inp2|, |inp3| // into the four Keccak states in |A|. |len| must be a multiple of 8. static void KeccakF1600_XORBytes_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp0, const uint8_t *inp1, const uint8_t *inp2, const uint8_t *inp3, size_t len) { KeccakF1600_XORBytes(A[0], inp0, len); KeccakF1600_XORBytes(A[1], inp1, len); KeccakF1600_XORBytes(A[2], inp2, len); KeccakF1600_XORBytes(A[3], inp3, len); } // KeccakF1600_ExtractBytes_x4 extracts |len| bytes from the four Keccak states in |A| // into |out0|, |out1|, |out2|, |out3|. static void KeccakF1600_ExtractBytes_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out0, uint8_t *out1, uint8_t *out2, uint8_t *out3, size_t len) { KeccakF1600_ExtractBytes(A[0], out0, len); KeccakF1600_ExtractBytes(A[1], out1, len); KeccakF1600_ExtractBytes(A[2], out2, len); KeccakF1600_ExtractBytes(A[3], out3, len); } static void Keccak1600_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) { // Dispatch logic for Keccak-x4 on AArch64: // // 1. If ASM is disabled, we use 4x the C implementation. // 2. 
If ASM is enabled: // - For Neoverse N1, we use scalar batched hybrid Keccak assembly from s2n-bignum // (`sha3_keccak4_f1600_alt()`) leveraging Neon and scalar assembly with // lazy rotations. // - For Neoverse V1, V2, we use SIMD batched hybrid Keccak assembly from s2n-bignum // (`sha3_keccak4_f1600_alt2()`) leveraging Neon, Neon SHA3 extension, // and scalar assembly with lazy rotations. // - Otherwise, if the Neon SHA3 extension is supported, we use the 2-fold // Keccak assembly from s2n-bignum (`sha3_keccak2_f1600()`) twice, // which is a straightforward implementation using the SHA3 extension. // - Otherwise, fall back to four times the 1-fold Keccak implementation // (which has its own dispatch logic). #if defined(KECCAK1600_S2N_BIGNUM_ASM) && defined(OPENSSL_AARCH64) if (CRYPTO_is_Neoverse_N1()) { keccak_log_dispatch(13); // kFlag_sha3_keccak4_f1600_alt sha3_keccak4_f1600_alt((uint64_t *)A, iotas); return; } #if defined(MY_ASSEMBLER_SUPPORTS_NEON_SHA3_EXTENSION) if (CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) { keccak_log_dispatch(14); // kFlag_sha3_keccak4_f1600_alt2 sha3_keccak4_f1600_alt2((uint64_t *)A, iotas); return; } if (CRYPTO_is_ARMv8_SHA3_capable()) { keccak_log_dispatch(12); // kFlag_sha3_keccak2_f1600 // Use 2-fold function twice: A[0:1] and A[2:3] sha3_keccak2_f1600((uint64_t *)&A[0], iotas); sha3_keccak2_f1600((uint64_t *)&A[2], iotas); return; } #endif #endif // Fallback: 4x individual KeccakF1600 calls (each with their own dispatch) KeccakF1600(A[0]); KeccakF1600(A[1]); KeccakF1600(A[2]); KeccakF1600(A[3]); } // One-shot absorb + finalize. Note that in contract to non-batched Keccak, // this does _not_ run a Keccak permutation at the end, allowing for a uniform // implementation of Keccak1600_Squeezeblocks_x4() without `padded` parameter // as in the non-batched implementation. 
// Each of |inp0|..|inp3| supplies |len| bytes for the corresponding state in
// |A|. |r| is the block size in bytes; it must be non-zero, a multiple of 8
// and at most SHA3_MAX_BLOCKSIZE. |p| is OR-ed into the byte that follows the
// last message byte of the final block, and bit 0x80 is set in the last byte
// of that block.
void Keccak1600_Absorb_once_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                               const uint8_t *inp0, const uint8_t *inp1,
                               const uint8_t *inp2, const uint8_t *inp3,
                               size_t len, size_t r, uint8_t p) {
  // |r| == 0 is rejected explicitly: the loop below would never terminate and
  // the |r - 1| index used for padding would wrap around.
  assert(r > 0 && r <= SHA3_MAX_BLOCKSIZE);

  // Absorb every complete block, permuting after each one.
  while (len >= r) {
    KeccakF1600_XORBytes_x4(A, inp0, inp1, inp2, inp3, r);
    Keccak1600_x4(A);
    inp0 += r;
    inp1 += r;
    inp2 += r;
    inp3 += r;
    len -= r;
  }

  // Build 16-byte aligned, zero-filled final blocks for each input and copy
  // the remaining |len| (< |r|) message bytes into them.
  alignas(16) uint8_t final[4][SHA3_MAX_BLOCKSIZE] = {{0}};
  OPENSSL_memcpy(final[0], inp0, len);
  OPENSSL_memcpy(final[1], inp1, len);
  OPENSSL_memcpy(final[2], inp2, len);
  OPENSSL_memcpy(final[3], inp3, len);

  // Apply the padding. Both markers are OR-ed in, so when |len| == |r| - 1
  // they share the last byte of the block and combine to |p| | 0x80.
  for (size_t k = 0; k < 4; k++) {
    final[k][len] |= p;
    final[k][r - 1] |= 0x80;
  }

  // Absorb the padded block. No permutation is run afterwards; the first
  // Keccak1600_Squeezeblocks_x4() iteration performs it.
  KeccakF1600_XORBytes_x4(A, final[0], final[1], final[2], final[3], r);

  // Clean up final blocks to avoid stack leakage
  OPENSSL_cleanse(final, sizeof(final));
}

// Keccak1600_Squeezeblocks_x4 produces |num_blocks| blocks of |r| bytes for
// each of the four states in |A|, writing them consecutively to |out0|..|out3|.
// The permutation is run before every block, including the first.
void Keccak1600_Squeezeblocks_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                                 uint8_t *out0, uint8_t *out1,
                                 uint8_t *out2, uint8_t *out3,
                                 size_t num_blocks, size_t r) {
  for (; num_blocks != 0; num_blocks--) {
    Keccak1600_x4(A);
    KeccakF1600_ExtractBytes_x4(A, out0, out1, out2, out3, r);
    out0 += r;
    out1 += r;
    out2 += r;
    out3 += r;
  }
}