// Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

#include <assert.h>
#include "internal.h"
#include "../../internal.h"
#include "../cpucap/internal.h"

// Round constants for Keccak-f[1600], one 64-bit constant per round. Round()
// XORs iotas[i] into lane [0][0] of its output for round index |i|, and the
// table is also handed to the sha3_keccak*_f1600* assembly routines used by
// the dispatch code further down. It holds 24 entries, matching the 24 rounds
// executed by KeccakF1600_c().
static const uint64_t iotas[] = {
    0x0000000000000001ULL,
    0x0000000000008082ULL,
    0x800000000000808aULL,
    0x8000000080008000ULL,
    0x000000000000808bULL,
    0x0000000080000001ULL,
    0x8000000080008081ULL,
    0x8000000000008009ULL,
    0x000000000000008aULL,
    0x0000000000000088ULL,
    0x0000000080008009ULL,
    0x000000008000000aULL,
    0x000000008000808bULL,
    0x800000000000008bULL,
    0x8000000000008089ULL,
    0x8000000000008003ULL,
    0x8000000000008002ULL,
    0x8000000000000080ULL,
    0x000000000000800aULL,
    0x800000008000000aULL,
    0x8000000080008081ULL,
    0x8000000000008080ULL,
    0x0000000080000001ULL,
    0x8000000080008008ULL
};

#if !defined(KECCAK1600_ASM)

// Left-rotation amounts used by Round(): the lane A[y][x], after being XORed
// with D[x], is rotated left by rhotates[y][x] bits. Entry [0][0] is zero, and
// Round() accordingly skips the rotation for that lane.
static const uint8_t rhotates[KECCAK1600_ROWS][KECCAK1600_ROWS] = {
    {  0,  1, 62, 28, 27 },
    { 36, 44,  6, 55, 20 },
    {  3, 10, 43, 25, 39 },
    { 41, 45, 15, 21,  8 },
    { 18,  2, 61, 56, 14 }
};

#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
    (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
    defined(__mips) || defined(__riscv) || defined(__s390__) || defined(__loongarch__) || \
    defined(__EMSCRIPTEN__)

 // These platforms don't support a "logical AND with complement" instruction.
 // When KECCAK_COMPLEMENTING_TRANSFORM is defined, KeccakF1600_c() keeps six
 // fixed lanes complemented while the rounds run and Round() uses alternative
 // formulas in which most of the `~x & y` terms become plain AND/OR operations.
# define KECCAK_COMPLEMENTING_TRANSFORM
#endif

// ROL64 returns |val| rotated left by |offset| bits. A zero |offset| is
// returned unchanged up front: computing it with the general formula would
// need a right shift by 64, which is undefined for a 64-bit operand. Callers
// in this file pass either the literal 1 or an entry of |rhotates|.
static uint64_t ROL64(uint64_t val, int offset) {
    if (offset != 0) {
        const uint64_t upper = val << offset;
        const uint64_t wrapped = val >> (64 - offset);
        return upper | wrapped;
    }
    return val;
}

 // KECCAK_2X:
 // This is the default implementation used in OpenSSL and the most efficient;
 // the other implementations were removed from this file.
 // This implementation is a variant of KECCAK_1X (see OpenSSL).
 // It moves the temporary storage out of the round procedure and simplifies
 // references to it by alternating it with the actual data (see the round
 // loop below).
 // This gives the compiler the best chance of producing good assembly and
 // provides the best instructions-per-processed-byte ratio at a minimal
 // round unroll factor.
// Round computes one round of the Keccak-f[1600] permutation. It reads the
// 5x5 lane state |A| and writes the resulting state to |R|. |R| is written
// while |A| is still being read, so the two must be distinct arrays. |i| is
// the round index; it selects the round constant iotas[i] and must therefore
// be smaller than the number of entries in |iotas|.
static void Round(uint64_t R[KECCAK1600_ROWS][KECCAK1600_ROWS], uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], size_t i) {
    uint64_t C[KECCAK1600_ROWS], D[KECCAK1600_ROWS];

    assert(i < (sizeof(iotas) / sizeof(iotas[0])));

    // Theta, part 1: C[x] is the XOR of the five lanes in column x.
    C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
    C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
    C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
    C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
    C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];

    // Theta, part 2: D[x] = ROL64(C[x + 1], 1) ^ C[x - 1], indices modulo 5.
    // D[x] is the correction XORed into every lane of column x below.
    D[0] = ROL64(C[1], 1) ^ C[4];
    D[1] = ROL64(C[2], 1) ^ C[0];
    D[2] = ROL64(C[3], 1) ^ C[1];
    D[3] = ROL64(C[4], 1) ^ C[2];
    D[4] = ROL64(C[0], 1) ^ C[3];

    // From here on C[] is reused as scratch for one output row at a time.
    // For each output row, five input lanes are gathered (one from every input
    // row), XORed with their column's D[] value and rotated by the matching
    // |rhotates| entry. The five results are then combined into R[row][0..4].

    // Output row 0: input lanes A[k][k]. A[0][0] has rotation amount 0, so
    // ROL64 is not called for it.
    C[0] =       A[0][0] ^ D[0];
    C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
    C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
    C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
    C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);

    // Non-linear step: without the complementing transform each output lane is
    // C[x] ^ (~C[x + 1] & C[x + 2]). With it, the formulas differ per lane
    // because some lanes are held in complemented form (see KeccakF1600_c).
    // The round constant is XORed into lane [0][0] only.
#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
    R[0][1] = C[1] ^ (~C[2] | C[3]);
    R[0][2] = C[2] ^ ( C[3] & C[4]);
    R[0][3] = C[3] ^ ( C[4] | C[0]);
    R[0][4] = C[4] ^ ( C[0] & C[1]);
#else
    R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
    R[0][1] = C[1] ^ (~C[2] & C[3]);
    R[0][2] = C[2] ^ (~C[3] & C[4]);
    R[0][3] = C[3] ^ (~C[4] & C[0]);
    R[0][4] = C[4] ^ (~C[0] & C[1]);
#endif

    // Output row 1: input lanes A[k][(k + 3) % 5].
    C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
    C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
    C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
    C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
    C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[1][0] = C[0] ^ (C[1] |  C[2]);
    R[1][1] = C[1] ^ (C[2] &  C[3]);
    R[1][2] = C[2] ^ (C[3] | ~C[4]);
    R[1][3] = C[3] ^ (C[4] |  C[0]);
    R[1][4] = C[4] ^ (C[0] &  C[1]);
#else
    R[1][0] = C[0] ^ (~C[1] & C[2]);
    R[1][1] = C[1] ^ (~C[2] & C[3]);
    R[1][2] = C[2] ^ (~C[3] & C[4]);
    R[1][3] = C[3] ^ (~C[4] & C[0]);
    R[1][4] = C[4] ^ (~C[0] & C[1]);
#endif

    // Output row 2: input lanes A[k][(k + 1) % 5].
    C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
    C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
    C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
    C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
    C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[2][0] =  C[0] ^ ( C[1] | C[2]);
    R[2][1] =  C[1] ^ ( C[2] & C[3]);
    R[2][2] =  C[2] ^ (~C[3] & C[4]);
    R[2][3] = ~C[3] ^ ( C[4] | C[0]);
    R[2][4] =  C[4] ^ ( C[0] & C[1]);
#else
    R[2][0] = C[0] ^ (~C[1] & C[2]);
    R[2][1] = C[1] ^ (~C[2] & C[3]);
    R[2][2] = C[2] ^ (~C[3] & C[4]);
    R[2][3] = C[3] ^ (~C[4] & C[0]);
    R[2][4] = C[4] ^ (~C[0] & C[1]);
#endif

    // Output row 3: input lanes A[k][(k + 4) % 5].
    C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
    C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
    C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
    C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
    C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[3][0] =  C[0] ^ ( C[1] & C[2]);
    R[3][1] =  C[1] ^ ( C[2] | C[3]);
    R[3][2] =  C[2] ^ (~C[3] | C[4]);
    R[3][3] = ~C[3] ^ ( C[4] & C[0]);
    R[3][4] =  C[4] ^ ( C[0] | C[1]);
#else
    R[3][0] = C[0] ^ (~C[1] & C[2]);
    R[3][1] = C[1] ^ (~C[2] & C[3]);
    R[3][2] = C[2] ^ (~C[3] & C[4]);
    R[3][3] = C[3] ^ (~C[4] & C[0]);
    R[3][4] = C[4] ^ (~C[0] & C[1]);
#endif

    // Output row 4: input lanes A[k][(k + 2) % 5].
    C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
    C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
    C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
    C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
    C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    R[4][0] =  C[0] ^ (~C[1] & C[2]);
    R[4][1] = ~C[1] ^ ( C[2] | C[3]);
    R[4][2] =  C[2] ^ ( C[3] & C[4]);
    R[4][3] =  C[3] ^ ( C[4] | C[0]);
    R[4][4] =  C[4] ^ ( C[0] & C[1]);
#else
    R[4][0] = C[0] ^ (~C[1] & C[2]);
    R[4][1] = C[1] ^ (~C[2] & C[3]);
    R[4][2] = C[2] ^ (~C[3] & C[4]);
    R[4][3] = C[3] ^ (~C[4] & C[0]);
    R[4][4] = C[4] ^ (~C[0] & C[1]);
#endif
}

// KeccakF1600_c applies the 24-round Keccak-f[1600] permutation to the state
// |A| in place, using the portable C round function.
static void KeccakF1600_c(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
    // Second state buffer: the rounds ping-pong between |A| and |scratch|, so
    // no per-round copy is needed.
    uint64_t scratch[KECCAK1600_ROWS][KECCAK1600_ROWS];

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    // (row, column) of the lanes that are held in complemented form while the
    // rounds run.
    static const uint8_t kFlippedLanes[6][2] = {
        {0, 1}, {0, 2}, {1, 3}, {2, 2}, {3, 2}, {4, 0}
    };
    const size_t num_flipped = sizeof(kFlippedLanes) / sizeof(kFlippedLanes[0]);

    for (size_t k = 0; k < num_flipped; k++) {
        uint64_t *lane = &A[kFlippedLanes[k][0]][kFlippedLanes[k][1]];
        *lane = ~*lane;
    }
#endif

    // Two rounds per iteration: A -> scratch -> A, so the result ends up in A.
    for (size_t round = 0; round < 24; round += 2) {
        Round(scratch, A, round);
        Round(A, scratch, round + 1);
    }

#ifdef KECCAK_COMPLEMENTING_TRANSFORM
    // Complement the same lanes again to return to the plain representation.
    for (size_t k = 0; k < num_flipped; k++) {
        uint64_t *lane = &A[kFlippedLanes[k][0]][kFlippedLanes[k][1]];
        *lane = ~*lane;
    }
#endif
}
#endif // !KECCAK1600_ASM

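// Forward declaration: KeccakF1600 is defined further down (after the dispatch
// logic) but is already called by Keccak1600_Absorb and Keccak1600_Squeeze.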
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]);

 // Keccak1600_Absorb can be called multiple times; at each invocation the
 // largest multiple of |r| out of |len| bytes is processed, and the number
 // of remaining bytes is returned. This spares the caller the trouble of
 // calculating the largest multiple of |r|. |r| can be viewed as the block
 // size. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104, 72, but can
 // also be (1600 - 448)/8 = 144. All this means that message padding and
 // intermediate sub-block buffering, byte- or bitwise, is the caller's
 // responsibility.

// KeccakF1600_XORBytes XORs |len| bytes from |inp| into the Keccak state |A|.
// The input is consumed eight bytes at a time; each group is assembled into a
// 64-bit word with the first byte least significant and XORed into
// consecutive lanes of |A|, starting at A[0][0]. |len| must be a multiple of 8
// and at most SHA3_MAX_BLOCKSIZE.
static void KeccakF1600_XORBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len) {
    assert(len <= SHA3_MAX_BLOCKSIZE);
    assert((len % 8) == 0);

    // View the 5x5 state as a flat run of lanes.
    uint64_t *lane = (uint64_t *)A;

    for (size_t remaining = len / 8; remaining != 0; remaining--) {
        // Assemble the word from the most significant byte downwards.
        uint64_t word = 0;
        for (size_t b = 8; b-- > 0;) {
            word = (word << 8) | inp[b];
        }

        *lane ^= word;
        lane++;
        inp += 8;
    }
}

// Keccak1600_Absorb XORs as many whole |r|-byte blocks of |inp| as fit into
// |len| bytes into the state |A|, calling KeccakF1600 after each block, and
// returns the number of trailing bytes (always fewer than |r|) that were not
// consumed. Padding and buffering of that tail are left to the caller.
//
// |r| must be a non-zero multiple of 8 that is smaller than the 200-byte
// state. Zero is rejected explicitly: it would satisfy |len >= r| forever and
// the loop below would never terminate.
size_t Keccak1600_Absorb(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], const uint8_t *inp, size_t len,
                         size_t r) {
    assert(r != 0 && r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    while (len >= r) {
        KeccakF1600_XORBytes(A, inp, r);
        KeccakF1600(A);
        inp += r;
        len -= r;
    }

    return len;
}

// KeccakF1600_ExtractBytes copies the first |len| bytes of the Keccak state
// |A| to |out|. Lanes are read in order starting at A[0][0] and each lane is
// written least significant byte first. At most one block (up to
// SHA3_MAX_BLOCKSIZE bytes) can be extracted per call; to obtain more output
// the state has to go through KeccakF1600 again (see Keccak1600_Squeeze).
static void KeccakF1600_ExtractBytes(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len) {
    assert(len <= SHA3_MAX_BLOCKSIZE);

    // View the 5x5 state as a flat run of lanes.
    const uint64_t *lane = (const uint64_t *)A;

    // Whole lanes: eight output bytes each.
    for (; len >= 8; len -= 8, lane++) {
        const uint64_t word = *lane;
        for (size_t b = 0; b < 8; b++) {
            out[b] = (uint8_t)(word >> (8 * b));
        }
        out += 8;
    }

    // Trailing partial lane: only its |len| lowest bytes are emitted. The lane
    // is not read at all when nothing is left to write.
    if (len != 0) {
        const uint64_t word = *lane;
        for (size_t b = 0; b < len; b++) {
            out[b] = (uint8_t)(word >> (8 * b));
        }
    }
}

// Keccak1600_Squeeze writes |len| bytes of output from the state |A| to |out|,
// at most |r| bytes per permutation. If |padded| is zero, the first block is
// extracted from the state as it is; otherwise KeccakF1600 is applied before
// the first extraction as well. Every subsequent block is always preceded by
// a KeccakF1600 call.
//
// |r| must be a non-zero multiple of 8 that is smaller than the 200-byte
// state. Zero is rejected explicitly: with |r| == 0 each iteration would
// extract nothing, |len| would never shrink and the loop would not terminate.
void Keccak1600_Squeeze(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out, size_t len, size_t r, int padded) {
    assert(r != 0 && r < (25 * sizeof(A[0][0])) && (r % 8) == 0);

    while (len != 0) {
        if (padded) {
            KeccakF1600(A);
        }
        // From the second block onwards the state must always be permuted.
        padded = 1;

        size_t extract_len = len < r ? len : r;
        KeccakF1600_ExtractBytes(A, out, extract_len);
        out += extract_len;
        len -= extract_len;
    }
}

#if defined(KECCAK1600_ASM)

// Scalar implementation from OpenSSL provided by keccak1600-armv8.pl
extern void KeccakF1600_hw(uint64_t state[25]);

#if defined(OPENSSL_AARCH64)
// keccak_log_dispatch records which Keccak implementation was selected. When
// BORINGSSL_DISPATCH_TEST is enabled it sets BORINGSSL_function_hit[id] to 1;
// otherwise it does nothing. The |id| values used by the callers are annotated
// with the corresponding kFlag_* name at each call site.
static void keccak_log_dispatch(size_t id) {
#if BORINGSSL_DISPATCH_TEST
    BORINGSSL_function_hit[id] = 1;
#endif
}
#endif

// KeccakF1600 applies the Keccak-f[1600] permutation to |A| in place. This is
// the variant compiled when KECCAK1600_ASM is defined; it selects one of the
// assembly implementations at run time.
//
// NOTE(review): if neither OPENSSL_AARCH64 nor OPENSSL_X86_64 is defined the
// body below is empty and |A| is left untouched. Presumably KECCAK1600_ASM is
// only ever defined for those two targets; confirm where it is defined.
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
    // Dispatch logic for Keccak-x1 on AArch64:
    //
    // 1. If ASM is disabled, we use the C implementation (see the other
    //    definition of KeccakF1600 in the #else branch below).
    // 2. If ASM is enabled:
    //   - For Neoverse N1, V1, V2, we use scalar Keccak assembly from s2n-bignum
    //     (`sha3_keccak_f1600()`)
    //     leveraging lazy rotations from https://eprint.iacr.org/2022/1243.
    //   - Otherwise, if the Neon SHA3 extension is supported, we use the Neon
    //     Keccak assembly from s2n-bignum (`sha3_keccak_f1600_alt()`),
    //     leveraging that extension.
    //   - Otherwise, fall back to scalar Keccak implementation from OpenSSL,
    //     (`KeccakF1600_hw()`), not using lazy rotations.
    //
    // Lazy rotations improve performance by up to 10% on CPUs with free
    // Barrel shifting, which includes Neoverse N1, V1, and V2. Not all
    // CPUs have free Barrel shifting (e.g. Apple M1 or Cortex-A72), so we
    // don't use it by default.
    //
    // Neoverse V1 and V2 do support SHA3 instructions, but they are only
    // implemented on 1/4 of Neon units, and are thus slower than a scalar
    // implementation.
    //
    // On x86_64 there is no run-time selection: `sha3_keccak_f1600()` is
    // always called.
#if defined(OPENSSL_AARCH64)
#if defined(KECCAK1600_S2N_BIGNUM_ASM)
    if (CRYPTO_is_Neoverse_N1() || CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) {
        keccak_log_dispatch(10); // kFlag_sha3_keccak_f1600
        sha3_keccak_f1600((uint64_t *)A, iotas);
        return;
    }

#if defined(MY_ASSEMBLER_SUPPORTS_NEON_SHA3_EXTENSION)
    if (CRYPTO_is_ARMv8_SHA3_capable()) {
        keccak_log_dispatch(11); // kFlag_sha3_keccak_f1600_alt
        sha3_keccak_f1600_alt((uint64_t *)A, iotas);
        return;
    }
#endif
#endif

    // No specialised implementation applies: use the OpenSSL scalar assembly.
    keccak_log_dispatch(9); // kFlag_KeccakF1600_hw
    KeccakF1600_hw((uint64_t *) A);

#elif defined(OPENSSL_X86_64)
    sha3_keccak_f1600((uint64_t *)A, iotas);
#endif
}

#else // KECCAK1600_ASM

// KeccakF1600 applies the Keccak-f[1600] permutation to |A| in place. This is
// the variant compiled when KECCAK1600_ASM is not defined; it forwards
// directly to the portable C implementation.
void KeccakF1600(uint64_t A[KECCAK1600_ROWS][KECCAK1600_ROWS]) {
    KeccakF1600_c(A);
}

#endif // !KECCAK1600_ASM

// KeccakF1600_XORBytes_x4 is the four-way counterpart of
// KeccakF1600_XORBytes: it XORs |len| bytes from |inp0|..|inp3| into the
// states A[0]..A[3] respectively. |len| must be a multiple of 8.
static void KeccakF1600_XORBytes_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                                    const uint8_t *inp0, const uint8_t *inp1,
                                    const uint8_t *inp2, const uint8_t *inp3,
                                    size_t len) {
    const uint8_t *const inputs[4] = {inp0, inp1, inp2, inp3};

    for (size_t k = 0; k < 4; k++) {
        KeccakF1600_XORBytes(A[k], inputs[k], len);
    }
}

// KeccakF1600_ExtractBytes_x4 is the four-way counterpart of
// KeccakF1600_ExtractBytes: it copies |len| bytes from each of the states
// A[0]..A[3] into |out0|..|out3| respectively.
static void KeccakF1600_ExtractBytes_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                                        uint8_t *out0, uint8_t *out1,
                                        uint8_t *out2, uint8_t *out3,
                                        size_t len) {
    uint8_t *const outputs[4] = {out0, out1, out2, out3};

    for (size_t k = 0; k < 4; k++) {
        KeccakF1600_ExtractBytes(A[k], outputs[k], len);
    }
}

// Keccak1600_x4 applies the Keccak-f[1600] permutation in place to each of the
// four states A[0]..A[3], using a batched assembly implementation when one is
// available for the running CPU and four single-state calls otherwise.
//
// NOTE(review): the batched branches call keccak_log_dispatch(), which is only
// defined under KECCAK1600_ASM && OPENSSL_AARCH64. Presumably
// KECCAK1600_S2N_BIGNUM_ASM is never defined without KECCAK1600_ASM; confirm
// where those macros are defined.
static void Keccak1600_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS]) {
    // Dispatch logic for Keccak-x4 on AArch64:
    //
    // 1. If ASM is disabled, we use 4x the C implementation.
    // 2. If ASM is enabled:
    // - For Neoverse N1, we use scalar batched hybrid Keccak assembly from s2n-bignum
    //   (`sha3_keccak4_f1600_alt()`) leveraging Neon and scalar assembly with
    //   lazy rotations.
    // - For Neoverse V1, V2, we use SIMD batched hybrid Keccak assembly from s2n-bignum
    //   (`sha3_keccak4_f1600_alt2()`) leveraging Neon, Neon SHA3 extension,
    //   and scalar assembly with lazy rotations.
    // - Otherwise, if the Neon SHA3 extension is supported, we use the 2-fold
    //   Keccak assembly from s2n-bignum (`sha3_keccak2_f1600()`) twice,
    //   which is a straightforward implementation using the SHA3 extension.
    // - Otherwise, fall back to four times the 1-fold Keccak implementation
    //   (which has its own dispatch logic).
#if defined(KECCAK1600_S2N_BIGNUM_ASM) && defined(OPENSSL_AARCH64)
    if (CRYPTO_is_Neoverse_N1()) {
        keccak_log_dispatch(13); // kFlag_sha3_keccak4_f1600_alt
        sha3_keccak4_f1600_alt((uint64_t *)A, iotas);
        return;
    }

    // The two branches below are only compiled when the assembler supports the
    // Neon SHA3 extension.
#if defined(MY_ASSEMBLER_SUPPORTS_NEON_SHA3_EXTENSION)
    if (CRYPTO_is_Neoverse_V1() || CRYPTO_is_Neoverse_V2()) {
        keccak_log_dispatch(14); // kFlag_sha3_keccak4_f1600_alt2
        sha3_keccak4_f1600_alt2((uint64_t *)A, iotas);
        return;
    }

    if (CRYPTO_is_ARMv8_SHA3_capable()) {
        keccak_log_dispatch(12); // kFlag_sha3_keccak2_f1600
        // Use 2-fold function twice: A[0:1] and A[2:3]
        sha3_keccak2_f1600((uint64_t *)&A[0], iotas);
        sha3_keccak2_f1600((uint64_t *)&A[2], iotas);
        return;
    }
#endif
#endif

    // Fallback: 4x individual KeccakF1600 calls (each with their own dispatch)
    KeccakF1600(A[0]);
    KeccakF1600(A[1]);
    KeccakF1600(A[2]);
    KeccakF1600(A[3]);
}

// One-shot absorb + finalize for four Keccak states at once. Note that in
// contrast to non-batched Keccak, this does _not_ run a Keccak permutation at
// the end, allowing for a uniform implementation of
// Keccak1600_Squeezeblocks_x4() without a `padded` parameter as in the
// non-batched implementation.
//
// |inp0|..|inp3| each supply |len| bytes that are absorbed into A[0]..A[3]
// respectively. Whole |r|-byte blocks are XORed in and permuted; the remaining
// tail is copied into a zeroed block, |p| is ORed into the byte right after
// the tail, 0x80 is ORed into the last byte of the block, and the block is
// XORed into the state.
//
// |r| must be a non-zero multiple of 8 no larger than SHA3_MAX_BLOCKSIZE,
// matching the precondition of the non-batched Keccak1600_Absorb. Zero is
// rejected explicitly: it would satisfy |len >= r| forever and the absorb loop
// would never terminate.
void Keccak1600_Absorb_once_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS],
                               const uint8_t *inp0, const uint8_t *inp1,
                               const uint8_t *inp2, const uint8_t *inp3,
                               size_t len, size_t r, uint8_t p) {
    assert(r != 0 && r <= SHA3_MAX_BLOCKSIZE && (r % 8) == 0);

    // Absorb all complete blocks.
    while (len >= r) {
        KeccakF1600_XORBytes_x4(A, inp0, inp1, inp2, inp3, r);
        Keccak1600_x4(A);
        inp0 += r;
        inp1 += r;
        inp2 += r;
        inp3 += r;
        len -= r;
    }

    // Build 16-byte aligned final blocks for each input
    alignas(16) uint8_t final[4][SHA3_MAX_BLOCKSIZE] = {{0}};

    // Copy the remainder bytes to final blocks; here |len| < |r|.
    OPENSSL_memcpy(final[0], inp0, len);
    OPENSSL_memcpy(final[1], inp1, len);
    OPENSSL_memcpy(final[2], inp2, len);
    OPENSSL_memcpy(final[3], inp3, len);

    if (len == r - 1) {
        // Only one byte of padding fits: it carries both |p| and the 0x80 bit.
        p |= 128;
    } else {
        final[0][r - 1] |= 128;
        final[1][r - 1] |= 128;
        final[2][r - 1] |= 128;
        final[3][r - 1] |= 128;
    }

    final[0][len] |= p;
    final[1][len] |= p;
    final[2][len] |= p;
    final[3][len] |= p;

    // XOR in the padded block without permuting (see the note above).
    KeccakF1600_XORBytes_x4(A, final[0], final[1], final[2], final[3], r);

    // Clean up final blocks to avoid stack leakage
    OPENSSL_cleanse(final, sizeof(final));
}

// Keccak1600_Squeezeblocks_x4 produces |num_blocks| output blocks of |r| bytes
// from each of the four states in |A|, writing them to |out0|..|out3|. The
// states are permuted with Keccak1600_x4 before every block, including the
// first one.
void Keccak1600_Squeezeblocks_x4(uint64_t A[4][KECCAK1600_ROWS][KECCAK1600_ROWS], uint8_t *out0, uint8_t *out1,
                                 uint8_t *out2, uint8_t *out3,
                                 size_t num_blocks, size_t r) {
    for (size_t remaining = num_blocks; remaining > 0; remaining--) {
        Keccak1600_x4(A);
        KeccakF1600_ExtractBytes_x4(A, out0, out1, out2, out3, r);

        // Advance every output cursor past the block just written.
        out0 += r;
        out1 += r;
        out2 += r;
        out3 += r;
    }
}