chore: checkpoint before Python removal

This commit is contained in:
2026-03-26 22:33:59 +00:00
parent 683cec9307
commit e568ddf82a
29972 changed files with 11269302 additions and 2 deletions

View File

@@ -0,0 +1,64 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
#ifndef MLK_AWSLC_FIPS202_GLUE_H
#define MLK_AWSLC_FIPS202_GLUE_H
#include <stddef.h>
#include <stdint.h>
#include "../sha/internal.h"
// Block sizes ("rates") of the FIPS-202 functions, in bytes.
// SHAKE128_RATE is used below to turn a block count into a byte count.
// NOTE(review): the values match the rates specified in FIPS 202.
#define SHAKE128_RATE 168
#define SHAKE256_RATE 136
#define SHA3_256_RATE 136
#define SHA3_384_RATE 104
#define SHA3_512_RATE 72
// The incremental SHAKE128 context type used by mlkem-native is mapped onto
// the KECCAK1600_CTX type.
#define mlk_shake128ctx KECCAK1600_CTX
// Initializes |state| for incremental SHAKE128 hashing by forwarding to
// SHAKE_Init with the SHAKE128 block size.
static MLK_INLINE void mlk_shake128_init(mlk_shake128ctx *state) {
// The return code check can be omitted:
// SHAKE_Init always returns 1 when called with correct block size value.
(void) SHAKE_Init(state, SHAKE128_BLOCKSIZE);
}
// Release hook required by the mlkem-native FIPS-202 API. It is a no-op here:
// |state| is not touched, and the cast only marks the parameter as used.
static MLK_INLINE void mlk_shake128_release(mlk_shake128ctx *state) {
(void) state;
}
// Absorbs the |inlen| bytes at |input| into |state| by forwarding to
// SHAKE_Absorb. The callee's return value is discarded.
static MLK_INLINE void mlk_shake128_absorb_once(mlk_shake128ctx *state,
const uint8_t *input, size_t inlen) {
// Return code check can be omitted
// since mlkem-native adheres to call discipline
(void) SHAKE_Absorb(state, input, inlen);
}
// Squeezes |nblocks| SHAKE128 blocks (SHAKE128_RATE bytes each) from |state|
// into |output|.
static MLK_INLINE void mlk_shake128_squeezeblocks(uint8_t *output, size_t nblocks,
                                                  mlk_shake128ctx *state) {
  // mlkem-native counts in blocks, whereas SHAKE_Squeeze is given a byte count.
  const size_t num_bytes = nblocks * SHAKE128_RATE;
  // The return code is deliberately dropped: mlkem-native adheres to the
  // call discipline expected by the SHAKE API.
  (void) SHAKE_Squeeze(output, state, num_bytes);
}
// One-shot SHAKE256: forwards to SHAKE256() to produce |outlen| bytes at
// |output| from the |inlen| bytes at |input|. Note the argument order differs
// between the two APIs (output first here, input first in SHAKE256()).
static MLK_INLINE void mlk_shake256(uint8_t *output, size_t outlen,
const uint8_t *input, size_t inlen) {
// Return code check can be omitted
// since mlkem-native adheres to call discipline
(void) SHAKE256(input, inlen, output, outlen);
}
// One-shot SHA3-256: forwards to SHA3_256() to hash the |inlen| bytes at
// |input| into |output|. The callee's return value is discarded.
// NOTE(review): |output| presumably must hold 32 bytes - confirm against
// the SHA3_256 declaration.
static MLK_INLINE void mlk_sha3_256(uint8_t *output, const uint8_t *input,
size_t inlen) {
// Return code check can be omitted
// since mlkem-native adheres to call discipline
(void) SHA3_256(input, inlen, output);
}
// One-shot SHA3-512: forwards to SHA3_512() to hash the |inlen| bytes at
// |input| into |output|. The callee's return value is discarded.
// NOTE(review): |output| presumably must hold 64 bytes - confirm against
// the SHA3_512 declaration.
static MLK_INLINE void mlk_sha3_512(uint8_t *output, const uint8_t *input,
size_t inlen) {
// Return code check can be omitted
// since mlkem-native adheres to call discipline
(void) SHA3_512(input, inlen, output);
}
#endif // MLK_AWSLC_FIPS202_GLUE_H

View File

@@ -0,0 +1,58 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
//
// This is a shim establishing the FIPS-202 API required by
// mlkem-native from the API exposed by AWS-LC.
//
#ifndef MLK_AWSLC_FIPS202X4_GLUE_H
#define MLK_AWSLC_FIPS202X4_GLUE_H
#include <stddef.h>
#include <stdint.h>
#include "fips202_glue.h"
// The 4-way SHAKE128 context type used by mlkem-native is mapped onto the
// KECCAK1600_CTX_x4 type.
#define mlk_shake128x4ctx KECCAK1600_CTX_x4
// Absorbs four inputs |in0|..|in3| into |state| by forwarding to
// SHAKE128_Absorb_once_x4. A single |inlen| is passed for all four inputs.
static MLK_INLINE void mlk_shake128x4_absorb_once(mlk_shake128x4ctx *state,
const uint8_t *in0,
const uint8_t *in1,
const uint8_t *in2,
const uint8_t *in3, size_t inlen) {
// Return code check can be omitted
// since mlkem-native adheres to call discipline
(void) SHAKE128_Absorb_once_x4(state, in0, in1, in2, in3, inlen);
}
// Squeezes |nblocks| blocks from |state| into each of |out0|..|out3| by
// forwarding to SHAKE128_Squeezeblocks_x4. Unlike the single-lane wrapper,
// the block count is passed through as-is rather than converted to bytes.
static MLK_INLINE void mlk_shake128x4_squeezeblocks(uint8_t *out0, uint8_t *out1,
uint8_t *out2, uint8_t *out3,
size_t nblocks,
mlk_shake128x4ctx *state) {
// Return code check can be omitted
// since mlkem-native adheres to call discipline
(void) SHAKE128_Squeezeblocks_x4(out0, out1, out2, out3, state, nblocks);
}
// Initializes the 4-way SHAKE128 context |state| by forwarding to
// SHAKE128_Init_x4. The callee's return value is discarded.
static MLK_INLINE void mlk_shake128x4_init(mlk_shake128x4ctx *state) {
// Return code check can be omitted
// since mlkem-native adheres to call discipline
(void) SHAKE128_Init_x4(state);
}
// Release hook required by the mlkem-native FIPS-202 x4 API. It is a no-op
// here: |state| is not touched, and the cast only marks the parameter as used.
static MLK_INLINE void mlk_shake128x4_release(mlk_shake128x4ctx *state) {
(void) state;
}
// One-shot 4-way SHAKE256: forwards to SHAKE256_x4() to produce |outlen| bytes
// in each of |out0|..|out3| from the |inlen|-byte inputs |in0|..|in3|.
// Note the argument order differs between the two APIs (outputs first here,
// inputs first in SHAKE256_x4()).
static MLK_INLINE void mlk_shake256x4(uint8_t *out0, uint8_t *out1, uint8_t *out2,
uint8_t *out3, size_t outlen, uint8_t *in0,
uint8_t *in1, uint8_t *in2, uint8_t *in3,
size_t inlen) {
// Return code check can be omitted
// since SHAKE256_x4 is documented not to fail for valid inputs.
(void) SHAKE256_x4(in0, in1, in2, in3, inlen,
out0, out1, out2, out3, outlen);
}
#endif // MLK_AWSLC_FIPS202X4_GLUE_H

View File

@@ -0,0 +1,438 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
// mlkem-native source code
//
// The mlkem-native monobuild file is included three times below, once per
// parameter set. The order of the #define/#undef lines matters because the
// included file is configured purely through these macros.
// Include level-independent code
#define MLK_CONFIG_FILE "../mlkem_native_config.h"
#define MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS
// MLKEM-512
#define MLK_CONFIG_PARAMETER_SET 512
#define MLK_CONFIG_MULTILEVEL_WITH_SHARED // Include level-independent code
#include "mlkem/mlkem_native_bcm.c"
// MLKEM-768
#undef MLK_CONFIG_PARAMETER_SET
#define MLK_CONFIG_PARAMETER_SET 768
// NOTE(review): MLK_CONFIG_MULTILEVEL_WITH_SHARED is not #undef'd here;
// presumably the included file undefines it - confirm.
#define MLK_CONFIG_MULTILEVEL_NO_SHARED // Exclude level-independent code
#include "mlkem/mlkem_native_bcm.c"
// MLKEM-1024
// This is the last inclusion, so the request to keep shared headers is dropped.
#undef MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS
#undef MLK_CONFIG_PARAMETER_SET
#define MLK_CONFIG_PARAMETER_SET 1024
#include "mlkem/mlkem_native_bcm.c"
// End of mlkem-native source code
#include "./ml_kem.h"
// Describes a caller-provided output buffer together with its length slot.
typedef struct {
  uint8_t *buffer;               // Destination for the output bytes.
  size_t *length;                // IN: capacity of |buffer|. OUT: bytes written.
  const size_t expected_length;  // Number of bytes the operation produces.
} output_buffer;

// Ensure buffer is long enough.
//
// Returns 1 if |data.buffer| and |data.length| are both non-NULL and the
// capacity stored in |*data.length| is at least |data.expected_length|.
// Returns 0 otherwise. The NULL check on |data.length| prevents a NULL
// dereference when a caller omits the length pointer.
static int check_buffer(const output_buffer data) {
  if (data.buffer == NULL || data.length == NULL) {
    return 0;
  }
  if (*data.length < data.expected_length) {
    return 0;
  }
  return 1;
}
// The EVP layer expects the length parameter to hold the number of bytes
// written once a call has succeeded. |result| follows the mlkem-native
// convention where 0 means success; for any other value |*data.length| is
// left untouched.
static void set_written_len_on_success(const int result, output_buffer data) {
  if (result != 0) {
    return;
  }
  *data.length = data.expected_length;
}
// Forward declarations of the parameter-set-independent helpers defined
// further down in this file. Each one receives the level-specific mlkem-native
// routine as a function pointer, plus output_buffer descriptors for every
// output. They return 1 when an output buffer fails check_buffer(), and
// otherwise the return value of the routine that was passed in.

// Randomized key generation.
int ml_kem_common_keypair(int (*keypair)(uint8_t * public_key, uint8_t *secret_key),
output_buffer public_key,
output_buffer secret_key);
// Encapsulation driven by a caller-supplied |seed|.
int ml_kem_common_encapsulate_deterministic(int (*encapsulate)(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key, const uint8_t *seed),
output_buffer ciphertext,
output_buffer shared_secret,
const uint8_t *public_key,
const uint8_t *seed);
// Encapsulation without a caller-supplied seed.
int ml_kem_common_encapsulate(int (*encapsulate)(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key),
output_buffer ciphertext,
output_buffer shared_secret,
const uint8_t *public_key);
// Decapsulation.
int ml_kem_common_decapsulate(int (*decapsulate)(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key),
output_buffer shared_secret,
const uint8_t *ciphertext,
const uint8_t *secret_key);
// Note: These methods currently default to using the reference code for ML_KEM.
// In a future where AWS-LC has optimized options available, those can be
// conditionally (or based on compile-time flags) called here, depending on
// platform support.
// ML-KEM-512 key generation driven by |seed|. Calls
// boringssl_ensure_ml_kem_self_test() and then forwards every argument to the
// _no_self_test variant, whose result is returned unchanged.
int ml_kem_512_keypair_deterministic(uint8_t *public_key  /* OUT */,
                                     size_t *public_len   /* IN_OUT */,
                                     uint8_t *secret_key  /* OUT */,
                                     size_t *secret_len   /* IN_OUT */,
                                     const uint8_t *seed  /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_512_keypair_deterministic_no_self_test(
      public_key, public_len, secret_key, secret_len, seed);
  return status;
}
// ML-KEM-512 key generation driven by |seed|, without the self-test call.
//
// Returns 1 if either output buffer fails check_buffer(). Otherwise returns
// the result of mlkem512_keypair_derand(); when that is 0, |*public_len| and
// |*secret_len| are set to the ML-KEM-512 key sizes.
int ml_kem_512_keypair_deterministic_no_self_test(uint8_t *public_key  /* OUT */,
                                                  size_t *public_len   /* IN_OUT */,
                                                  uint8_t *secret_key  /* OUT */,
                                                  size_t *secret_len   /* IN_OUT */,
                                                  const uint8_t *seed  /* IN */) {
  output_buffer pk_out = {public_key, public_len, MLKEM512_PUBLIC_KEY_BYTES};
  output_buffer sk_out = {secret_key, secret_len, MLKEM512_SECRET_KEY_BYTES};
  if (!(check_buffer(pk_out) && check_buffer(sk_out))) {
    return 1;
  }

  const int rc = mlkem512_keypair_derand(pk_out.buffer, sk_out.buffer, seed);
#if defined(AWSLC_FIPS)
  /* Key generation can only fail through a PCT failure. */
  if (rc != 0) {
    AWS_LC_FIPS_failure("ML-KEM keygen PCT failed");
  }
#endif

  set_written_len_on_success(rc, pk_out);
  set_written_len_on_success(rc, sk_out);
  return rc;
}
// ML-KEM-512 randomized key generation. Wraps the outputs in output_buffer
// descriptors carrying the ML-KEM-512 key sizes and delegates to
// ml_kem_common_keypair() with mlkem512_keypair.
int ml_kem_512_keypair(uint8_t *public_key  /* OUT */,
                       size_t *public_len   /* IN_OUT */,
                       uint8_t *secret_key  /* OUT */,
                       size_t *secret_len   /* IN_OUT */) {
  output_buffer pk_out = {public_key, public_len, MLKEM512_PUBLIC_KEY_BYTES};
  output_buffer sk_out = {secret_key, secret_len, MLKEM512_SECRET_KEY_BYTES};
  const int rc = ml_kem_common_keypair(mlkem512_keypair, pk_out, sk_out);
  return rc;
}
// ML-KEM-512 encapsulation driven by |seed|. Calls
// boringssl_ensure_ml_kem_self_test() and then forwards every argument to the
// _no_self_test variant, whose result is returned unchanged.
int ml_kem_512_encapsulate_deterministic(uint8_t *ciphertext       /* OUT */,
                                         size_t *ciphertext_len    /* IN_OUT */,
                                         uint8_t *shared_secret    /* OUT */,
                                         size_t *shared_secret_len /* IN_OUT */,
                                         const uint8_t *public_key /* IN */,
                                         const uint8_t *seed       /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_512_encapsulate_deterministic_no_self_test(
      ciphertext, ciphertext_len, shared_secret, shared_secret_len, public_key,
      seed);
  return status;
}
// ML-KEM-512 encapsulation driven by |seed|, without the self-test call.
// Wraps the outputs in output_buffer descriptors carrying the ML-KEM-512
// sizes and delegates to ml_kem_common_encapsulate_deterministic() with
// mlkem512_enc_derand.
int ml_kem_512_encapsulate_deterministic_no_self_test(uint8_t *ciphertext       /* OUT */,
                                                      size_t *ciphertext_len    /* IN_OUT */,
                                                      uint8_t *shared_secret    /* OUT */,
                                                      size_t *shared_secret_len /* IN_OUT */,
                                                      const uint8_t *public_key /* IN */,
                                                      const uint8_t *seed       /* IN */) {
  output_buffer ct_out = {ciphertext, ciphertext_len, MLKEM512_CIPHERTEXT_BYTES};
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_encapsulate_deterministic(
      mlkem512_enc_derand, ct_out, ss_out, public_key, seed);
  return rc;
}
// ML-KEM-512 encapsulation. Wraps the outputs in output_buffer descriptors
// carrying the ML-KEM-512 sizes and delegates to ml_kem_common_encapsulate()
// with mlkem512_enc.
int ml_kem_512_encapsulate(uint8_t *ciphertext       /* OUT */,
                           size_t *ciphertext_len    /* IN_OUT */,
                           uint8_t *shared_secret    /* OUT */,
                           size_t *shared_secret_len /* IN_OUT */,
                           const uint8_t *public_key /* IN */) {
  output_buffer ct_out = {ciphertext, ciphertext_len, MLKEM512_CIPHERTEXT_BYTES};
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_encapsulate(mlkem512_enc, ct_out, ss_out, public_key);
  return rc;
}
// ML-KEM-512 decapsulation. Calls boringssl_ensure_ml_kem_self_test() and
// then forwards every argument to the _no_self_test variant, whose result is
// returned unchanged.
int ml_kem_512_decapsulate(uint8_t *shared_secret    /* OUT */,
                           size_t *shared_secret_len /* IN_OUT */,
                           const uint8_t *ciphertext /* IN */,
                           const uint8_t *secret_key /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_512_decapsulate_no_self_test(
      shared_secret, shared_secret_len, ciphertext, secret_key);
  return status;
}
// ML-KEM-512 decapsulation without the self-test call. Wraps the output in an
// output_buffer descriptor carrying the shared-secret size and delegates to
// ml_kem_common_decapsulate() with mlkem512_dec.
int ml_kem_512_decapsulate_no_self_test(uint8_t *shared_secret    /* OUT */,
                                        size_t *shared_secret_len /* IN_OUT */,
                                        const uint8_t *ciphertext /* IN */,
                                        const uint8_t *secret_key /* IN */) {
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM512_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_decapsulate(mlkem512_dec, ss_out, ciphertext, secret_key);
  return rc;
}
// ML-KEM-768 key generation driven by |seed|. Calls
// boringssl_ensure_ml_kem_self_test() and then forwards every argument to the
// _no_self_test variant, whose result is returned unchanged.
int ml_kem_768_keypair_deterministic(uint8_t *public_key  /* OUT */,
                                     size_t *public_len   /* IN_OUT */,
                                     uint8_t *secret_key  /* OUT */,
                                     size_t *secret_len   /* IN_OUT */,
                                     const uint8_t *seed  /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_768_keypair_deterministic_no_self_test(
      public_key, public_len, secret_key, secret_len, seed);
  return status;
}
// ML-KEM-768 key generation driven by |seed|, without the self-test call.
//
// Returns 1 if either output buffer fails check_buffer(). Otherwise returns
// the result of mlkem768_keypair_derand(); when that is 0, |*public_len| and
// |*secret_len| are set to the ML-KEM-768 key sizes.
int ml_kem_768_keypair_deterministic_no_self_test(uint8_t *public_key  /* OUT */,
                                                  size_t *public_len   /* IN_OUT */,
                                                  uint8_t *secret_key  /* OUT */,
                                                  size_t *secret_len   /* IN_OUT */,
                                                  const uint8_t *seed  /* IN */) {
  output_buffer pk_out = {public_key, public_len, MLKEM768_PUBLIC_KEY_BYTES};
  output_buffer sk_out = {secret_key, secret_len, MLKEM768_SECRET_KEY_BYTES};
  if (!(check_buffer(pk_out) && check_buffer(sk_out))) {
    return 1;
  }

  const int rc = mlkem768_keypair_derand(pk_out.buffer, sk_out.buffer, seed);
#if defined(AWSLC_FIPS)
  /* Key generation can only fail through a PCT failure. */
  if (rc != 0) {
    AWS_LC_FIPS_failure("ML-KEM keygen PCT failed");
  }
#endif

  set_written_len_on_success(rc, pk_out);
  set_written_len_on_success(rc, sk_out);
  return rc;
}
// ML-KEM-768 randomized key generation. Wraps the outputs in output_buffer
// descriptors carrying the ML-KEM-768 key sizes and delegates to
// ml_kem_common_keypair() with mlkem768_keypair.
int ml_kem_768_keypair(uint8_t *public_key  /* OUT */,
                       size_t *public_len   /* IN_OUT */,
                       uint8_t *secret_key  /* OUT */,
                       size_t *secret_len   /* IN_OUT */) {
  output_buffer pk_out = {public_key, public_len, MLKEM768_PUBLIC_KEY_BYTES};
  output_buffer sk_out = {secret_key, secret_len, MLKEM768_SECRET_KEY_BYTES};
  const int rc = ml_kem_common_keypair(mlkem768_keypair, pk_out, sk_out);
  return rc;
}
// ML-KEM-768 encapsulation driven by |seed|. Calls
// boringssl_ensure_ml_kem_self_test() and then forwards every argument to the
// _no_self_test variant, whose result is returned unchanged.
int ml_kem_768_encapsulate_deterministic(uint8_t *ciphertext       /* OUT */,
                                         size_t *ciphertext_len    /* IN_OUT */,
                                         uint8_t *shared_secret    /* OUT */,
                                         size_t *shared_secret_len /* IN_OUT */,
                                         const uint8_t *public_key /* IN */,
                                         const uint8_t *seed       /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_768_encapsulate_deterministic_no_self_test(
      ciphertext, ciphertext_len, shared_secret, shared_secret_len, public_key,
      seed);
  return status;
}
// ML-KEM-768 encapsulation driven by |seed|, without the self-test call.
// Wraps the outputs in output_buffer descriptors carrying the ML-KEM-768
// sizes and delegates to ml_kem_common_encapsulate_deterministic() with
// mlkem768_enc_derand.
int ml_kem_768_encapsulate_deterministic_no_self_test(uint8_t *ciphertext       /* OUT */,
                                                      size_t *ciphertext_len    /* IN_OUT */,
                                                      uint8_t *shared_secret    /* OUT */,
                                                      size_t *shared_secret_len /* IN_OUT */,
                                                      const uint8_t *public_key /* IN */,
                                                      const uint8_t *seed       /* IN */) {
  output_buffer ct_out = {ciphertext, ciphertext_len, MLKEM768_CIPHERTEXT_BYTES};
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_encapsulate_deterministic(
      mlkem768_enc_derand, ct_out, ss_out, public_key, seed);
  return rc;
}
// ML-KEM-768 encapsulation. Wraps the outputs in output_buffer descriptors
// carrying the ML-KEM-768 sizes and delegates to ml_kem_common_encapsulate()
// with mlkem768_enc.
int ml_kem_768_encapsulate(uint8_t *ciphertext       /* OUT */,
                           size_t *ciphertext_len    /* IN_OUT */,
                           uint8_t *shared_secret    /* OUT */,
                           size_t *shared_secret_len /* IN_OUT */,
                           const uint8_t *public_key /* IN */) {
  output_buffer ct_out = {ciphertext, ciphertext_len, MLKEM768_CIPHERTEXT_BYTES};
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_encapsulate(mlkem768_enc, ct_out, ss_out, public_key);
  return rc;
}
// ML-KEM-768 decapsulation. Calls boringssl_ensure_ml_kem_self_test() and
// then forwards every argument to the _no_self_test variant, whose result is
// returned unchanged.
int ml_kem_768_decapsulate(uint8_t *shared_secret    /* OUT */,
                           size_t *shared_secret_len /* IN_OUT */,
                           const uint8_t *ciphertext /* IN */,
                           const uint8_t *secret_key /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_768_decapsulate_no_self_test(
      shared_secret, shared_secret_len, ciphertext, secret_key);
  return status;
}
// ML-KEM-768 decapsulation without the self-test call. Wraps the output in an
// output_buffer descriptor carrying the shared-secret size and delegates to
// ml_kem_common_decapsulate() with mlkem768_dec.
int ml_kem_768_decapsulate_no_self_test(uint8_t *shared_secret    /* OUT */,
                                        size_t *shared_secret_len /* IN_OUT */,
                                        const uint8_t *ciphertext /* IN */,
                                        const uint8_t *secret_key /* IN */) {
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM768_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_decapsulate(mlkem768_dec, ss_out, ciphertext, secret_key);
  return rc;
}
// ML-KEM-1024 key generation driven by |seed|. Calls
// boringssl_ensure_ml_kem_self_test() and then forwards every argument to the
// _no_self_test variant, whose result is returned unchanged.
int ml_kem_1024_keypair_deterministic(uint8_t *public_key  /* OUT */,
                                      size_t *public_len   /* IN_OUT */,
                                      uint8_t *secret_key  /* OUT */,
                                      size_t *secret_len   /* IN_OUT */,
                                      const uint8_t *seed  /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_1024_keypair_deterministic_no_self_test(
      public_key, public_len, secret_key, secret_len, seed);
  return status;
}
// ML-KEM-1024 key generation driven by |seed|, without the self-test call.
//
// Returns 1 if either output buffer fails check_buffer(). Otherwise returns
// the result of mlkem1024_keypair_derand(); when that is 0, |*public_len| and
// |*secret_len| are set to the ML-KEM-1024 key sizes.
int ml_kem_1024_keypair_deterministic_no_self_test(uint8_t *public_key  /* OUT */,
                                                   size_t *public_len   /* IN_OUT */,
                                                   uint8_t *secret_key  /* OUT */,
                                                   size_t *secret_len   /* IN_OUT */,
                                                   const uint8_t *seed  /* IN */) {
  output_buffer pk_out = {public_key, public_len, MLKEM1024_PUBLIC_KEY_BYTES};
  output_buffer sk_out = {secret_key, secret_len, MLKEM1024_SECRET_KEY_BYTES};
  if (!(check_buffer(pk_out) && check_buffer(sk_out))) {
    return 1;
  }

  const int rc = mlkem1024_keypair_derand(pk_out.buffer, sk_out.buffer, seed);
#if defined(AWSLC_FIPS)
  /* Key generation can only fail through a PCT failure. */
  if (rc != 0) {
    AWS_LC_FIPS_failure("ML-KEM keygen PCT failed");
  }
#endif

  set_written_len_on_success(rc, pk_out);
  set_written_len_on_success(rc, sk_out);
  return rc;
}
// ML-KEM-1024 randomized key generation. Wraps the outputs in output_buffer
// descriptors carrying the ML-KEM-1024 key sizes and delegates to
// ml_kem_common_keypair() with mlkem1024_keypair.
int ml_kem_1024_keypair(uint8_t *public_key  /* OUT */,
                        size_t *public_len   /* IN_OUT */,
                        uint8_t *secret_key  /* OUT */,
                        size_t *secret_len   /* IN_OUT */) {
  output_buffer pk_out = {public_key, public_len, MLKEM1024_PUBLIC_KEY_BYTES};
  output_buffer sk_out = {secret_key, secret_len, MLKEM1024_SECRET_KEY_BYTES};
  const int rc = ml_kem_common_keypair(mlkem1024_keypair, pk_out, sk_out);
  return rc;
}
// ML-KEM-1024 encapsulation driven by |seed|. Calls
// boringssl_ensure_ml_kem_self_test() and then forwards every argument to the
// _no_self_test variant, whose result is returned unchanged.
int ml_kem_1024_encapsulate_deterministic(uint8_t *ciphertext       /* OUT */,
                                          size_t *ciphertext_len    /* IN_OUT */,
                                          uint8_t *shared_secret    /* OUT */,
                                          size_t *shared_secret_len /* IN_OUT */,
                                          const uint8_t *public_key /* IN */,
                                          const uint8_t *seed       /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_1024_encapsulate_deterministic_no_self_test(
      ciphertext, ciphertext_len, shared_secret, shared_secret_len, public_key,
      seed);
  return status;
}
// ML-KEM-1024 encapsulation driven by |seed|, without the self-test call.
// Wraps the outputs in output_buffer descriptors carrying the ML-KEM-1024
// sizes and delegates to ml_kem_common_encapsulate_deterministic() with
// mlkem1024_enc_derand.
int ml_kem_1024_encapsulate_deterministic_no_self_test(uint8_t *ciphertext       /* OUT */,
                                                       size_t *ciphertext_len    /* IN_OUT */,
                                                       uint8_t *shared_secret    /* OUT */,
                                                       size_t *shared_secret_len /* IN_OUT */,
                                                       const uint8_t *public_key /* IN */,
                                                       const uint8_t *seed       /* IN */) {
  output_buffer ct_out = {ciphertext, ciphertext_len, MLKEM1024_CIPHERTEXT_BYTES};
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_encapsulate_deterministic(
      mlkem1024_enc_derand, ct_out, ss_out, public_key, seed);
  return rc;
}
// ML-KEM-1024 encapsulation. Wraps the outputs in output_buffer descriptors
// carrying the ML-KEM-1024 sizes and delegates to ml_kem_common_encapsulate()
// with mlkem1024_enc.
int ml_kem_1024_encapsulate(uint8_t *ciphertext       /* OUT */,
                            size_t *ciphertext_len    /* IN_OUT */,
                            uint8_t *shared_secret    /* OUT */,
                            size_t *shared_secret_len /* IN_OUT */,
                            const uint8_t *public_key /* IN */) {
  output_buffer ct_out = {ciphertext, ciphertext_len, MLKEM1024_CIPHERTEXT_BYTES};
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_encapsulate(mlkem1024_enc, ct_out, ss_out, public_key);
  return rc;
}
// ML-KEM-1024 decapsulation. Calls boringssl_ensure_ml_kem_self_test() and
// then forwards every argument to the _no_self_test variant, whose result is
// returned unchanged.
int ml_kem_1024_decapsulate(uint8_t *shared_secret    /* OUT */,
                            size_t *shared_secret_len /* IN_OUT */,
                            const uint8_t *ciphertext /* IN */,
                            const uint8_t *secret_key /* IN */) {
  boringssl_ensure_ml_kem_self_test();
  const int status = ml_kem_1024_decapsulate_no_self_test(
      shared_secret, shared_secret_len, ciphertext, secret_key);
  return status;
}
// ML-KEM-1024 decapsulation without the self-test call. Wraps the output in
// an output_buffer descriptor carrying the shared-secret size and delegates
// to ml_kem_common_decapsulate() with mlkem1024_dec.
int ml_kem_1024_decapsulate_no_self_test(uint8_t *shared_secret    /* OUT */,
                                         size_t *shared_secret_len /* IN_OUT */,
                                         const uint8_t *ciphertext /* IN */,
                                         const uint8_t *secret_key /* IN */) {
  output_buffer ss_out = {shared_secret, shared_secret_len, MLKEM1024_SHARED_SECRET_LEN};
  const int rc = ml_kem_common_decapsulate(mlkem1024_dec, ss_out, ciphertext, secret_key);
  return rc;
}
// Randomized key generation shared by all parameter sets.
//
// Calls boringssl_ensure_ml_kem_self_test(), then returns 1 if either output
// buffer fails check_buffer(). Otherwise invokes |keypair| on the two buffers
// and returns its result; when that is 0, the length slots of |public_key|
// and |secret_key| are set to their expected lengths.
int ml_kem_common_keypair(int (*keypair)(uint8_t * public_key, uint8_t *secret_key),
                          output_buffer public_key,
                          output_buffer secret_key) {
  boringssl_ensure_ml_kem_self_test();
  if (!(check_buffer(public_key) && check_buffer(secret_key))) {
    return 1;
  }

  const int rc = keypair(public_key.buffer, secret_key.buffer);
#if defined(AWSLC_FIPS)
  /* Key generation can only fail through a PCT failure. */
  if (rc != 0) {
    AWS_LC_FIPS_failure("ML-KEM keygen PCT failed");
  }
#endif

  set_written_len_on_success(rc, public_key);
  set_written_len_on_success(rc, secret_key);
  return rc;
}
// Seed-driven encapsulation shared by all parameter sets.
//
// Returns 1 if either output buffer fails check_buffer(). Otherwise invokes
// |encapsulate| and returns its result; when that is 0, the length slots of
// |ciphertext| and |shared_secret| are set to their expected lengths.
// This helper does not itself call boringssl_ensure_ml_kem_self_test().
int ml_kem_common_encapsulate_deterministic(int (*encapsulate)(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key, const uint8_t *seed),
                                            output_buffer ciphertext,
                                            output_buffer shared_secret,
                                            const uint8_t *public_key,
                                            const uint8_t *seed) {
  if (!(check_buffer(ciphertext) && check_buffer(shared_secret))) {
    return 1;
  }

  const int rc = encapsulate(ciphertext.buffer, shared_secret.buffer,
                             public_key, seed);
  set_written_len_on_success(rc, ciphertext);
  set_written_len_on_success(rc, shared_secret);
  return rc;
}
// Encapsulation shared by all parameter sets.
//
// Calls boringssl_ensure_ml_kem_self_test(), then returns 1 if either output
// buffer fails check_buffer(). Otherwise invokes |encapsulate| and returns its
// result; when that is 0, the length slots of |ciphertext| and
// |shared_secret| are set to their expected lengths.
int ml_kem_common_encapsulate(int (*encapsulate)(uint8_t *ciphertext, uint8_t *shared_secret, const uint8_t *public_key),
                              output_buffer ciphertext,
                              output_buffer shared_secret,
                              const uint8_t *public_key) {
  boringssl_ensure_ml_kem_self_test();
  if (!(check_buffer(ciphertext) && check_buffer(shared_secret))) {
    return 1;
  }

  const int rc = encapsulate(ciphertext.buffer, shared_secret.buffer, public_key);
  set_written_len_on_success(rc, ciphertext);
  set_written_len_on_success(rc, shared_secret);
  return rc;
}
// Decapsulation shared by all parameter sets.
//
// Returns 1 if |shared_secret| fails check_buffer(). Otherwise invokes
// |decapsulate| and returns its result; when that is 0, the length slot of
// |shared_secret| is set to its expected length.
// This helper does not itself call boringssl_ensure_ml_kem_self_test().
int ml_kem_common_decapsulate(int (*decapsulate)(uint8_t *shared_secret, const uint8_t *ciphertext, const uint8_t *secret_key),
                              output_buffer shared_secret,
                              const uint8_t *ciphertext,
                              const uint8_t *secret_key) {
  if (check_buffer(shared_secret) == 0) {
    return 1;
  }

  const int rc = decapsulate(shared_secret.buffer, ciphertext, secret_key);
  set_written_len_on_success(rc, shared_secret);
  return rc;
}
// Checks an ML-KEM-512 public key. Returns -1 if |public_key| is NULL or
// |public_key_len| is not MLKEM512_PUBLIC_KEY_BYTES; otherwise returns the
// result of mlkem512_check_pk().
int ml_kem_512_check_pk(const uint8_t *public_key, size_t public_key_len) {
  const int shape_ok =
      public_key != NULL && public_key_len == MLKEM512_PUBLIC_KEY_BYTES;
  return shape_ok ? mlkem512_check_pk(public_key) : -1;
}
// Checks an ML-KEM-512 secret key. Returns -1 if |secret_key| is NULL or
// |secret_key_len| is not MLKEM512_SECRET_KEY_BYTES; otherwise returns the
// result of mlkem512_check_sk().
int ml_kem_512_check_sk(const uint8_t *secret_key, size_t secret_key_len) {
  const int shape_ok =
      secret_key != NULL && secret_key_len == MLKEM512_SECRET_KEY_BYTES;
  return shape_ok ? mlkem512_check_sk(secret_key) : -1;
}
// Checks an ML-KEM-768 public key. Returns -1 if |public_key| is NULL or
// |public_key_len| is not MLKEM768_PUBLIC_KEY_BYTES; otherwise returns the
// result of mlkem768_check_pk().
int ml_kem_768_check_pk(const uint8_t *public_key, size_t public_key_len) {
  const int shape_ok =
      public_key != NULL && public_key_len == MLKEM768_PUBLIC_KEY_BYTES;
  return shape_ok ? mlkem768_check_pk(public_key) : -1;
}
// Checks an ML-KEM-768 secret key. Returns -1 if |secret_key| is NULL or
// |secret_key_len| is not MLKEM768_SECRET_KEY_BYTES; otherwise returns the
// result of mlkem768_check_sk().
int ml_kem_768_check_sk(const uint8_t *secret_key, size_t secret_key_len) {
  const int shape_ok =
      secret_key != NULL && secret_key_len == MLKEM768_SECRET_KEY_BYTES;
  return shape_ok ? mlkem768_check_sk(secret_key) : -1;
}
// Checks an ML-KEM-1024 public key. Returns -1 if |public_key| is NULL or
// |public_key_len| is not MLKEM1024_PUBLIC_KEY_BYTES; otherwise returns the
// result of mlkem1024_check_pk().
int ml_kem_1024_check_pk(const uint8_t *public_key, size_t public_key_len) {
  const int shape_ok =
      public_key != NULL && public_key_len == MLKEM1024_PUBLIC_KEY_BYTES;
  return shape_ok ? mlkem1024_check_pk(public_key) : -1;
}
// Checks an ML-KEM-1024 secret key. Returns -1 if |secret_key| is NULL or
// |secret_key_len| is not MLKEM1024_SECRET_KEY_BYTES; otherwise returns the
// result of mlkem1024_check_sk().
int ml_kem_1024_check_sk(const uint8_t *secret_key, size_t secret_key_len) {
  const int shape_ok =
      secret_key != NULL && secret_key_len == MLKEM1024_SECRET_KEY_BYTES;
  return shape_ok ? mlkem1024_check_sk(secret_key) : -1;
}

View File

@@ -0,0 +1,198 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
#ifndef ML_KEM_H
#define ML_KEM_H
#include <stdint.h>
#include <openssl/base.h>
// Object sizes, in bytes, for each ML-KEM parameter set.
// The *_PUBLIC_KEY_BYTES, *_SECRET_KEY_BYTES, *_CIPHERTEXT_BYTES and
// *_SHARED_SECRET_LEN values are the sizes written to the corresponding
// IN_OUT length parameters on success.
// NOTE(review): *_KEYGEN_SEED_LEN and *_ENCAPS_SEED_LEN are presumably the
// required lengths of the |seed| inputs of the deterministic functions -
// confirm against the callers.

// ML-KEM-512
#define MLKEM512_SHARED_SECRET_LEN (32)
#define MLKEM512_KEYGEN_SEED_LEN (64)
#define MLKEM512_ENCAPS_SEED_LEN (32)
#define MLKEM512_PUBLIC_KEY_BYTES (800)
#define MLKEM512_SECRET_KEY_BYTES (1632)
#define MLKEM512_CIPHERTEXT_BYTES (768)
// ML-KEM-768
#define MLKEM768_SHARED_SECRET_LEN (32)
#define MLKEM768_KEYGEN_SEED_LEN (64)
#define MLKEM768_ENCAPS_SEED_LEN (32)
#define MLKEM768_PUBLIC_KEY_BYTES (1184)
#define MLKEM768_SECRET_KEY_BYTES (2400)
#define MLKEM768_CIPHERTEXT_BYTES (1088)
// ML-KEM-1024
#define MLKEM1024_SHARED_SECRET_LEN (32)
#define MLKEM1024_KEYGEN_SEED_LEN (64)
#define MLKEM1024_ENCAPS_SEED_LEN (32)
#define MLKEM1024_PUBLIC_KEY_BYTES (1568)
#define MLKEM1024_SECRET_KEY_BYTES (3168)
#define MLKEM1024_CIPHERTEXT_BYTES (1568)
#if defined(__cplusplus)
extern "C" {
#endif
// ML-KEM-512 API.
//
// Conventions shared by the key generation, encapsulation and decapsulation
// functions below:
// - Each OUT buffer is paired with an IN_OUT length. On entry the length holds
//   the capacity of the buffer; when the call returns 0 it is set to the
//   number of bytes written.
// - A NULL OUT buffer, or a capacity smaller than the output size, makes the
//   call return 1.
// - The *_no_self_test variants omit the boringssl_ensure_ml_kem_self_test()
//   call made by their counterparts and are not marked OPENSSL_EXPORT.

// Generates a key pair from |seed|.
OPENSSL_EXPORT int ml_kem_512_keypair_deterministic(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */,
const uint8_t *seed /* IN */);
// Same as ml_kem_512_keypair_deterministic, without the self-test call.
int ml_kem_512_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */,
const uint8_t *seed /* IN */);
// Generates a key pair without a caller-supplied seed.
OPENSSL_EXPORT int ml_kem_512_keypair(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */);
// Encapsulates to |public_key| using |seed|.
OPENSSL_EXPORT int ml_kem_512_encapsulate_deterministic(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */,
const uint8_t *seed /* IN */);
// Same as ml_kem_512_encapsulate_deterministic, without the self-test call.
int ml_kem_512_encapsulate_deterministic_no_self_test(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */,
const uint8_t *seed /* IN */);
// Encapsulates to |public_key| without a caller-supplied seed.
OPENSSL_EXPORT int ml_kem_512_encapsulate(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */);
// Decapsulates |ciphertext| with |secret_key|.
OPENSSL_EXPORT int ml_kem_512_decapsulate(uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *ciphertext /* IN */,
const uint8_t *secret_key /* IN */);
// Same as ml_kem_512_decapsulate, without the self-test call.
int ml_kem_512_decapsulate_no_self_test(uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *ciphertext /* IN */,
const uint8_t *secret_key /* IN */);
// Checks a public key. Returns -1 if |public_key| is NULL or |public_key_len|
// is not MLKEM512_PUBLIC_KEY_BYTES; otherwise the result of the underlying
// mlkem-native check.
OPENSSL_EXPORT int ml_kem_512_check_pk(const uint8_t *public_key /* IN */,
size_t public_key_len /* IN */);
// Checks a secret key. Returns -1 if |secret_key| is NULL or |secret_key_len|
// is not MLKEM512_SECRET_KEY_BYTES; otherwise the result of the underlying
// mlkem-native check.
OPENSSL_EXPORT int ml_kem_512_check_sk(const uint8_t *secret_key /* IN */,
size_t secret_key_len /* IN */);
// ML-KEM-768 API.
//
// Conventions shared by the key generation, encapsulation and decapsulation
// functions below:
// - Each OUT buffer is paired with an IN_OUT length. On entry the length holds
//   the capacity of the buffer; when the call returns 0 it is set to the
//   number of bytes written.
// - A NULL OUT buffer, or a capacity smaller than the output size, makes the
//   call return 1.
// - The *_no_self_test variants omit the boringssl_ensure_ml_kem_self_test()
//   call made by their counterparts and are not marked OPENSSL_EXPORT.

// Generates a key pair from |seed|.
OPENSSL_EXPORT int ml_kem_768_keypair_deterministic(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */,
const uint8_t *seed /* IN */);
// Same as ml_kem_768_keypair_deterministic, without the self-test call.
int ml_kem_768_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */,
const uint8_t *seed /* IN */);
// Generates a key pair without a caller-supplied seed.
OPENSSL_EXPORT int ml_kem_768_keypair(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */);
// Encapsulates to |public_key| using |seed|.
OPENSSL_EXPORT int ml_kem_768_encapsulate_deterministic(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */,
const uint8_t *seed /* IN */);
// Same as ml_kem_768_encapsulate_deterministic, without the self-test call.
int ml_kem_768_encapsulate_deterministic_no_self_test(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */,
const uint8_t *seed /* IN */);
// Encapsulates to |public_key| without a caller-supplied seed.
OPENSSL_EXPORT int ml_kem_768_encapsulate(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */);
// Decapsulates |ciphertext| with |secret_key|.
OPENSSL_EXPORT int ml_kem_768_decapsulate(uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *ciphertext /* IN */,
const uint8_t *secret_key /* IN */);
// Same as ml_kem_768_decapsulate, without the self-test call.
int ml_kem_768_decapsulate_no_self_test(uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *ciphertext /* IN */,
const uint8_t *secret_key /* IN */);
// Checks a public key. Returns -1 if |public_key| is NULL or |public_key_len|
// is not MLKEM768_PUBLIC_KEY_BYTES; otherwise the result of the underlying
// mlkem-native check.
OPENSSL_EXPORT int ml_kem_768_check_pk(const uint8_t *public_key /* IN */,
size_t public_key_len /* IN */);
// Checks a secret key. Returns -1 if |secret_key| is NULL or |secret_key_len|
// is not MLKEM768_SECRET_KEY_BYTES; otherwise the result of the underlying
// mlkem-native check.
OPENSSL_EXPORT int ml_kem_768_check_sk(const uint8_t *secret_key /* IN */,
size_t secret_key_len /* IN */);
// ML-KEM-1024 API.
//
// Conventions shared by the key generation, encapsulation and decapsulation
// functions below:
// - Each OUT buffer is paired with an IN_OUT length. On entry the length holds
//   the capacity of the buffer; when the call returns 0 it is set to the
//   number of bytes written.
// - A NULL OUT buffer, or a capacity smaller than the output size, makes the
//   call return 1.
// - The *_no_self_test variants omit the boringssl_ensure_ml_kem_self_test()
//   call made by their counterparts and are not marked OPENSSL_EXPORT.

// Generates a key pair from |seed|.
OPENSSL_EXPORT int ml_kem_1024_keypair_deterministic(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */,
const uint8_t *seed /* IN */);
// Same as ml_kem_1024_keypair_deterministic, without the self-test call.
int ml_kem_1024_keypair_deterministic_no_self_test(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */,
const uint8_t *seed /* IN */);
// Generates a key pair without a caller-supplied seed.
OPENSSL_EXPORT int ml_kem_1024_keypair(uint8_t *public_key /* OUT */,
size_t *public_len /* IN_OUT */,
uint8_t *secret_key /* OUT */,
size_t *secret_len /* IN_OUT */);
// Encapsulates to |public_key| using |seed|.
OPENSSL_EXPORT int ml_kem_1024_encapsulate_deterministic(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */,
const uint8_t *seed /* IN */);
// Same as ml_kem_1024_encapsulate_deterministic, without the self-test call.
int ml_kem_1024_encapsulate_deterministic_no_self_test(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */,
const uint8_t *seed /* IN */);
// Encapsulates to |public_key| without a caller-supplied seed.
OPENSSL_EXPORT int ml_kem_1024_encapsulate(uint8_t *ciphertext /* OUT */,
size_t *ciphertext_len /* IN_OUT */,
uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *public_key /* IN */);
// Decapsulates |ciphertext| with |secret_key|.
OPENSSL_EXPORT int ml_kem_1024_decapsulate(uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *ciphertext /* IN */,
const uint8_t *secret_key /* IN */);
// Same as ml_kem_1024_decapsulate, without the self-test call.
int ml_kem_1024_decapsulate_no_self_test(uint8_t *shared_secret /* OUT */,
size_t *shared_secret_len /* IN_OUT */,
const uint8_t *ciphertext /* IN */,
const uint8_t *secret_key /* IN */);
// Checks a public key. Returns -1 if |public_key| is NULL or |public_key_len|
// is not MLKEM1024_PUBLIC_KEY_BYTES; otherwise the result of the underlying
// mlkem-native check.
OPENSSL_EXPORT int ml_kem_1024_check_pk(const uint8_t *public_key /* IN */,
size_t public_key_len /* IN */);
// Checks a secret key. Returns -1 if |secret_key| is NULL or |secret_key_len|
// is not MLKEM1024_SECRET_KEY_BYTES; otherwise the result of the underlying
// mlkem-native check.
OPENSSL_EXPORT int ml_kem_1024_check_sk(const uint8_t *secret_key /* IN */,
size_t secret_key_len /* IN */);
#if defined(__cplusplus)
}
#endif
#endif // ML_KEM_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,164 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_CBMC_H
#define MLK_CBMC_H
/***************************************************
* Basic replacements for __CPROVER_XXX contracts
***************************************************/
/* When CBMC is not defined, contract and loop annotations expand to nothing,
* so annotated code compiles as plain C. */
#ifndef CBMC
#define __contract__(x)
#define __loop__(x)
#else /* !CBMC */
/* When CBMC is defined, the annotations are passed through unchanged and the
* short names below map onto the corresponding __CPROVER_ primitives. */
#define __contract__(x) x
#define __loop__(x) x
/* https://diffblue.github.io/cbmc/contracts-assigns.html */
#define assigns(...) __CPROVER_assigns(__VA_ARGS__)
/* https://diffblue.github.io/cbmc/contracts-requires-ensures.html */
#define requires(...) __CPROVER_requires(__VA_ARGS__)
#define ensures(...) __CPROVER_ensures(__VA_ARGS__)
/* https://diffblue.github.io/cbmc/contracts-loops.html */
#define invariant(...) __CPROVER_loop_invariant(__VA_ARGS__)
#define decreases(...) __CPROVER_decreases(__VA_ARGS__)
/* cassert to avoid confusion with in-built assert */
#define cassert(x) __CPROVER_assert(x, "cbmc assertion failed")
#define assume(...) __CPROVER_assume(__VA_ARGS__)
/***************************************************
* Macros for "expression" forms that may appear
* _inside_ top-level contracts.
***************************************************/
/*
* function return value - useful inside ensures
* https://diffblue.github.io/cbmc/contracts-functions.html
*/
#define return_value (__CPROVER_return_value)
/*
* assigns l-value targets
* https://diffblue.github.io/cbmc/contracts-assigns.html
*
* Note that memory_slice is a rename of __CPROVER_object_upto; the other two
* macros keep the name of the primitive they wrap.
*/
#define object_whole(...) __CPROVER_object_whole(__VA_ARGS__)
#define memory_slice(...) __CPROVER_object_upto(__VA_ARGS__)
#define same_object(...) __CPROVER_same_object(__VA_ARGS__)
/*
* Pointer-related predicates
* https://diffblue.github.io/cbmc/contracts-memory-predicates.html
*
* memory_no_alias wraps __CPROVER_is_fresh; readable and writeable wrap
* __CPROVER_r_ok and __CPROVER_w_ok respectively.
*/
#define memory_no_alias(...) __CPROVER_is_fresh(__VA_ARGS__)
#define readable(...) __CPROVER_r_ok(__VA_ARGS__)
#define writeable(...) __CPROVER_w_ok(__VA_ARGS__)
/*
* History variables
* https://diffblue.github.io/cbmc/contracts-history-variables.html
*/
#define old(...) __CPROVER_old(__VA_ARGS__)
#define loop_entry(...) __CPROVER_loop_entry(__VA_ARGS__)
/*
* Quantifiers
* Note that the range on qvar is half-open: qvar_lb is _inclusive_ and
* qvar_ub is _exclusive_, i.e. qvar_lb <= qvar < qvar_ub.
* The quantified variable is declared as unsigned.
* https://diffblue.github.io/cbmc/contracts-quantifiers.html
*/
/*
* Prevent clang-format from corrupting CBMC's special ==> operator
*/
/* clang-format off */
/* predicate holds for every qvar in the range */
#define forall(qvar, qvar_lb, qvar_ub, predicate) \
__CPROVER_forall \
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> (predicate) \
}
/* predicate holds for at least one qvar in the range */
#define EXISTS(qvar, qvar_lb, qvar_ub, predicate) \
__CPROVER_exists \
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) && (predicate) \
}
/* clang-format on */
/***************************************************
* Convenience macros for common contract patterns
***************************************************/
/*
* Boolean-valued predicate that asserts that "all values of array_var at
* indices qvar_lb (inclusive) .. qvar_ub (exclusive) are in range
* value_lb (inclusive) .. value_ub (exclusive)"
* Example:
* array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q)
* expands to
* __CPROVER_forall { unsigned k; (0 <= k && k < MLKEM_N) ==> (
* ((int)0 <= a->coeffs[k]) && (a->coeffs[k] < (int)MLKEM_Q)) }
* where the quantified variable k gets a name derived from __LINE__.
*/
/*
* Prevent clang-format from corrupting CBMC's special ==> operator
*/
/* clang-format off */
/* Two-level concatenation so that arguments such as __LINE__ are expanded
* before being pasted. */
#define CBMC_CONCAT_(left, right) left##right
#define CBMC_CONCAT(left, right) CBMC_CONCAT_(left, right)
/* Core of array_bound with an explicit quantified variable name. The value
* bounds are cast to int before the comparison. */
#define array_bound_core(qvar, qvar_lb, qvar_ub, array_var, \
value_lb, value_ub) \
__CPROVER_forall \
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
(((int)(value_lb) <= ((array_var)[(qvar)])) && \
(((array_var)[(qvar)]) < (int)(value_ub))) \
}
#define array_bound(array_var, qvar_lb, qvar_ub, value_lb, value_ub) \
array_bound_core(CBMC_CONCAT(_cbmc_idx, __LINE__), (qvar_lb), \
(qvar_ub), (array_var), (value_lb), (value_ub))
/* Asserts that every element of array_var in the index range equals its old()
* value. The array is viewed as an array of int16_t for this purpose. */
#define array_unchanged_core(qvar, qvar_lb, qvar_ub, array_var) \
__CPROVER_forall \
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
((array_var)[(qvar)]) == (old(* (int16_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
}
/* array_unchanged over indices 0 (inclusive) .. N (exclusive) */
#define array_unchanged(array_var, N) \
array_unchanged_core(CBMC_CONCAT(_cbmc_idx, __LINE__), 0, (N), (array_var))
/* Same as array_unchanged_core, but the array is viewed as an array of
* uint64_t. */
#define array_unchanged_u64_core(qvar, qvar_lb, qvar_ub, array_var) \
__CPROVER_forall \
{ \
unsigned qvar; \
((qvar_lb) <= (qvar) && (qvar) < (qvar_ub)) ==> \
((array_var)[(qvar)]) == (old(* (uint64_t (*)[(qvar_ub)])(array_var)))[(qvar)] \
}
/* array_unchanged_u64 over indices 0 (inclusive) .. N (exclusive) */
#define array_unchanged_u64(array_var, N) \
array_unchanged_u64_core(CBMC_CONCAT(_cbmc_idx, __LINE__), 0, (N), (array_var))
/* clang-format on */
/* Wrapper around array_bound operating on absolute values.
*
* The absolute value bound `k` is exclusive.
*
* Note that since the lower bound in array_bound is inclusive, we have to
* raise it by 1 here.
*
* In other words, this asserts -k < arr[i] < k for every index i in
* lb (inclusive) .. ub (exclusive).
*/
#define array_abs_bound(arr, lb, ub, k) \
array_bound((arr), (lb), (ub), -((int)(k)) + 1, (k))
#endif /* CBMC */
#endif /* !MLK_CBMC_H */

View File

@@ -0,0 +1,167 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_COMMON_H
#define MLK_COMMON_H
#if defined(MLK_CONFIG_FILE)
#include MLK_CONFIG_FILE
#else
#include "config.h"
#endif
#include "params.h"
#include "sys.h"
/* Linkage qualifiers for the internal and the public API.
 *
 * Internal and public API have external linkage by default, but
 * this can be overridden by the user, e.g. for single-CU builds:
 * MLK_INTERNAL_API / MLK_EXTERNAL_API expand to nothing unless
 * MLK_CONFIG_INTERNAL_API_QUALIFIER / MLK_CONFIG_EXTERNAL_API_QUALIFIER
 * is defined, in which case they expand to that qualifier. */
#if !defined(MLK_CONFIG_INTERNAL_API_QUALIFIER)
#define MLK_INTERNAL_API
#else
#define MLK_INTERNAL_API MLK_CONFIG_INTERNAL_API_QUALIFIER
#endif
#if !defined(MLK_CONFIG_EXTERNAL_API_QUALIFIER)
#define MLK_EXTERNAL_API
#else
#define MLK_EXTERNAL_API MLK_CONFIG_EXTERNAL_API_QUALIFIER
#endif
/* MLK_MULTILEVEL_BUILD is defined whenever either of the two multi-level
 * configuration options (with or without shared code) is set. */
#if defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) || \
defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED)
#define MLK_MULTILEVEL_BUILD
#endif
/* Two-level token pasting, so that macro arguments are expanded before
 * being concatenated. */
#define MLK_CONCAT_(x1, x2) x1##x2
#define MLK_CONCAT(x1, x2) MLK_CONCAT_(x1, x2)
/* MLK_ADD_PARAM_SET(s): in a multi-level build, append the value of
 * MLK_CONFIG_PARAMETER_SET to s; otherwise leave s unchanged. */
#if defined(MLK_MULTILEVEL_BUILD)
#define MLK_ADD_PARAM_SET(s) MLK_CONCAT(s, MLK_CONFIG_PARAMETER_SET)
#else
#define MLK_ADD_PARAM_SET(s) s
#endif
/* Namespace prefixes, each terminated by an underscore:
 * - MLK_NAMESPACE_PREFIX is built from MLK_CONFIG_NAMESPACE_PREFIX alone.
 * - MLK_NAMESPACE_PREFIX_K first passes MLK_CONFIG_NAMESPACE_PREFIX through
 *   MLK_ADD_PARAM_SET, so it carries the parameter set when that macro
 *   adds one. */
#define MLK_NAMESPACE_PREFIX MLK_CONCAT(MLK_CONFIG_NAMESPACE_PREFIX, _)
#define MLK_NAMESPACE_PREFIX_K \
MLK_CONCAT(MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX), _)
/* Functions are prefixed by MLK_CONFIG_NAMESPACE_PREFIX.
 *
 * If multiple parameter sets are used, functions depending on the parameter
 * set additionally carry 512/768/1024 in their prefix. See config.h.
 *
 * Example: If MLK_CONFIG_NAMESPACE_PREFIX is mlkem, then
 * MLK_NAMESPACE_K(enc) becomes mlkem512_enc/mlkem768_enc/mlkem1024_enc.
 */
#define MLK_NAMESPACE(s) MLK_CONCAT(MLK_NAMESPACE_PREFIX, s)
#define MLK_NAMESPACE_K(s) MLK_CONCAT(MLK_NAMESPACE_PREFIX_K, s)
/* On Apple platforms, we need to emit a leading underscore
 * in front of assembly symbols. We thus introduce a separate
 * namespace wrapper for ASM symbols: it is identical to MLK_NAMESPACE
 * unless __APPLE__ is defined, in which case `_` is prepended. */
#if !defined(__APPLE__)
#define MLK_ASM_NAMESPACE(sym) MLK_NAMESPACE(sym)
#else
#define MLK_ASM_NAMESPACE(sym) MLK_CONCAT(_, MLK_NAMESPACE(sym))
#endif
/*
 * MLK_ASM_FN_SYMBOL(sym) expands to the namespaced assembly symbol followed
 * by a colon, i.e. a label definition.
 *
 * On X86_64 if control-flow protections (CET) are enabled (through
 * -fcf-protection=), we add an endbr64 instruction at every global function
 * label: there, the label is additionally followed by MLK_CET_ENDBR.
 * See sys.h for more details.
 */
#if defined(MLK_SYS_X86_64)
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) : MLK_CET_ENDBR
#else
#define MLK_ASM_FN_SYMBOL(sym) MLK_ASM_NAMESPACE(sym) :
#endif
/* We aim to simplify the user's life by supporting builds where
 * all source files are included, even those that are not needed.
 * Those files are appropriately guarded and will be empty when unneeded.
 * The following is to avoid compilers complaining about this.
 *
 * MLK_EMPTY_CU(s) declares an extern int whose namespaced name ends in
 * empty_cu_<s>. The expansion already ends in a semicolon, so the macro
 * is invoked without one. */
#define MLK_EMPTY_CU(s) extern int MLK_NAMESPACE_K(empty_cu_##s);
/* MLK_CONFIG_NO_ASM takes precedence over
 * MLK_CONFIG_USE_NATIVE_BACKEND_XXX: if it is set, both native-backend
 * options are undefined here. */
#if defined(MLK_CONFIG_NO_ASM)
#undef MLK_CONFIG_USE_NATIVE_BACKEND_ARITH
#undef MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202
#endif
/* Configuration sanity checks: requesting a native backend without naming
 * the corresponding backend file is a compile-time error. */
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH) && \
!defined(MLK_CONFIG_ARITH_BACKEND_FILE)
#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_ARITH is set, but MLK_CONFIG_ARITH_BACKEND_FILE is not.
#endif
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202) && \
!defined(MLK_CONFIG_FIPS202_BACKEND_FILE)
#error Bad configuration: MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 is set, but MLK_CONFIG_FIPS202_BACKEND_FILE is not.
#endif
/* Pull in the configured native arithmetic backend, if any. */
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
#include MLK_CONFIG_ARITH_BACKEND_FILE
/* Include to enforce consistency of API and implementation,
 * and conduct sanity checks on the backend.
 *
 * Keep this _after_ the inclusion of the backend; otherwise,
 * the sanity checks won't have an effect.
 *
 * The API header is only included when MLK_CHECK_APIS is set and this
 * header is not being preprocessed for assembly. */
#if defined(MLK_CHECK_APIS) && !defined(__ASSEMBLER__)
#include "native/api.h"
#endif
#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */
/* Pull in the configured native FIPS202 backend, if any. */
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202)
#include MLK_CONFIG_FIPS202_BACKEND_FILE
/* Include to enforce consistency of API and implementation,
 * and conduct sanity checks on the backend.
 *
 * Keep this _after_ the inclusion of the backend; otherwise,
 * the sanity checks won't have an effect.
 *
 * The API header is only included when MLK_CHECK_APIS is set and this
 * header is not being preprocessed for assembly. */
#if defined(MLK_CHECK_APIS) && !defined(__ASSEMBLER__)
#include "fips202/native/api.h"
#endif
#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202 */
/* Select the FIPS202 headers: MLK_FIPS202_HEADER_FILE and
 * MLK_FIPS202X4_HEADER_FILE name the bundled headers unless the
 * configuration provides MLK_CONFIG_FIPS202_CUSTOM_HEADER or
 * MLK_CONFIG_FIPS202X4_CUSTOM_HEADER, respectively. */
#if !defined(MLK_CONFIG_FIPS202_CUSTOM_HEADER)
#define MLK_FIPS202_HEADER_FILE "fips202/fips202.h"
#else
#define MLK_FIPS202_HEADER_FILE MLK_CONFIG_FIPS202_CUSTOM_HEADER
#endif
#if !defined(MLK_CONFIG_FIPS202X4_CUSTOM_HEADER)
#define MLK_FIPS202X4_HEADER_FILE "fips202/fips202x4.h"
#else
#define MLK_FIPS202X4_HEADER_FILE MLK_CONFIG_FIPS202X4_CUSTOM_HEADER
#endif
/* Standard library function replacements
 *
 * Unless MLK_CONFIG_CUSTOM_MEMCPY / MLK_CONFIG_CUSTOM_MEMSET is defined,
 * mlk_memcpy / mlk_memset are mapped to memcpy / memset from <string.h>.
 * When the respective option is defined, nothing is defined here.
 * The whole section is skipped when preprocessing for assembly. */
#if !defined(__ASSEMBLER__)
#if !defined(MLK_CONFIG_CUSTOM_MEMCPY)
#include <string.h>
#define mlk_memcpy memcpy
#endif
#if !defined(MLK_CONFIG_CUSTOM_MEMSET)
#include <string.h>
#define mlk_memset memset
#endif
#endif /* !__ASSEMBLER__ */
/* Just in case we want to include mlkem_native.h, set the configuration
 * for that header in accordance with the configuration used here. */
/* Double-check that this is not conflicting with pre-existing definitions:
 * any MLK_CONFIG_API_XXX option that is already defined at this point is
 * rejected with a compile-time error. */
#if defined(MLK_CONFIG_API_PARAMETER_SET) || \
defined(MLK_CONFIG_API_NAMESPACE_PREFIX) || \
defined(MLK_CONFIG_API_NO_SUPERCOP) || \
defined(MLK_CONFIG_API_CONSTANTS_ONLY)
#error Pre-existing MLK_CONFIG_API_XXX configuration is neither useful nor allowed during an mlkem-native build
#endif /* MLK_CONFIG_API_PARAMETER_SET || MLK_CONFIG_API_NAMESPACE_PREFIX || \
MLK_CONFIG_API_NO_SUPERCOP || MLK_CONFIG_API_CONSTANTS_ONLY */
/* Forward the build's parameter set and (parameter-set-extended) namespace
 * prefix to the API header's configuration options. */
#define MLK_CONFIG_API_PARAMETER_SET MLK_CONFIG_PARAMETER_SET
#define MLK_CONFIG_API_NAMESPACE_PREFIX \
MLK_ADD_PARAM_SET(MLK_CONFIG_NAMESPACE_PREFIX)
#endif /* !MLK_COMMON_H */

View File

@@ -0,0 +1,539 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <stdint.h>
#include <string.h>
#include "cbmc.h"
#include "compress.h"
#include "debug.h"
#include "verify.h"
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
/* 4-bit compression and serialization of a polynomial.
 *
 * Reference: `poly_compress()` in the reference implementation @[REF],
 *            for ML-KEM-{512,768}.
 *            - Unlike the reference implementation, which works with
 *              coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1),
 *              the input is assumed to be unsigned canonical here. */
MLK_INTERNAL_API
void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
                          const mlk_poly *a)
{
  unsigned blk;
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_compress_d4_native(r, a->coeffs) == MLK_NATIVE_FUNC_SUCCESS)
  {
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);

  /* Each iteration turns 8 coefficients into 4 output bytes. */
  for (blk = 0; blk < MLKEM_N / 8; blk++)
  __loop__(invariant(blk <= MLKEM_N / 8))
  {
    unsigned lane;
    uint8_t nib[8] = {0};
    for (lane = 0; lane < 8; lane++)
    __loop__(
      invariant(blk <= MLKEM_N / 8 && lane <= 8)
      invariant(array_bound(nib, 0, lane, 0, 16)))
    {
      nib[lane] = mlk_scalar_compress_d4(a->coeffs[8 * blk + lane]);
    }

    /* Two 4-bit values per byte; the even-indexed one is the low nibble. */
    r[4 * blk + 0] = nib[0] | (nib[1] << 4);
    r[4 * blk + 1] = nib[2] | (nib[3] << 4);
    r[4 * blk + 2] = nib[4] | (nib[5] << 4);
    r[4 * blk + 3] = nib[6] | (nib[7] << 4);
  }
}
/* 10-bit compression and serialization of a polynomial.
 *
 * Reference: Embedded into `polyvec_compress()` in the
 *            reference implementation, for ML-KEM-{512,768}.
 *            - Unlike the reference implementation, which works with
 *              coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1),
 *              the input is assumed to be unsigned canonical here. */
MLK_INTERNAL_API
void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
                           const mlk_poly *a)
{
  unsigned grp;
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_compress_d10_native(r, a->coeffs) == MLK_NATIVE_FUNC_SUCCESS)
  {
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);

  /* Each iteration turns 4 coefficients into 5 output bytes. */
  for (grp = 0; grp < MLKEM_N / 4; grp++)
  __loop__(invariant(grp <= MLKEM_N / 4))
  {
    unsigned n;
    uint16_t c[4];
    for (n = 0; n < 4; n++)
    __loop__(
      invariant(n <= 4)
      invariant(forall(r, 0, n, c[r] < (1u << 10))))
    {
      c[n] = mlk_scalar_compress_d10(a->coeffs[4 * grp + n]);
    }

    /*
     * All truncation to 8 bits is spelled out explicitly. Where no mask
     * is applied, nothing is lost because every c[n] fits in 10 bits.
     */
    r[5 * grp + 0] = (c[0] >> 0) & 0xFF;
    r[5 * grp + 1] = (c[0] >> 8) | ((c[1] << 2) & 0xFF);
    r[5 * grp + 2] = (c[1] >> 6) | ((c[2] << 4) & 0xFF);
    r[5 * grp + 3] = (c[2] >> 4) | ((c[3] << 6) & 0xFF);
    r[5 * grp + 4] = (c[3] >> 2);
  }
}
/* De-serialization and 4-bit decompression of a polynomial.
 *
 * Reference: `poly_decompress()` in the reference implementation @[REF],
 *            for ML-KEM-{512,768}. */
MLK_INTERNAL_API
void mlk_poly_decompress_d4(mlk_poly *r,
                            const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
{
  unsigned n;
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_decompress_d4_native(r->coeffs, a) == MLK_NATIVE_FUNC_SUCCESS)
  {
    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */

  /* Every input byte holds two 4-bit values, low nibble first. */
  for (n = 0; n < MLKEM_N / 2; n++)
  __loop__(
    invariant(n <= MLKEM_N / 2)
    invariant(array_bound(r->coeffs, 0, 2 * n, 0, MLKEM_Q)))
  {
    const uint8_t packed = a[n];
    r->coeffs[2 * n + 0] = mlk_scalar_decompress_d4((packed >> 0) & 0xF);
    r->coeffs[2 * n + 1] = mlk_scalar_decompress_d4((packed >> 4) & 0xF);
  }

  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
/* De-serialization and 10-bit decompression of a polynomial.
 *
 * Reference: Embedded into `polyvec_decompress()` in the
 *            reference implementation, for ML-KEM-{512,768}. */
MLK_INTERNAL_API
void mlk_poly_decompress_d10(mlk_poly *r,
                             const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
{
  unsigned grp;
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_decompress_d10_native(r->coeffs, a) == MLK_NATIVE_FUNC_SUCCESS)
  {
    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */

  /* Each iteration turns 5 input bytes into 4 coefficients. */
  for (grp = 0; grp < MLKEM_N / 4; grp++)
  __loop__(
    invariant(grp <= MLKEM_N / 4)
    invariant(array_bound(r->coeffs, 0, 4 * grp, 0, MLKEM_Q)))
  {
    unsigned n;
    uint16_t c[4];
    const uint8_t *src = &a[5 * grp];

    /* Extract four 10-bit values from the 40 input bits. */
    c[0] = 0x3FF & ((src[0] >> 0) | ((uint16_t)src[1] << 8));
    c[1] = 0x3FF & ((src[1] >> 2) | ((uint16_t)src[2] << 6));
    c[2] = 0x3FF & ((src[2] >> 4) | ((uint16_t)src[3] << 4));
    c[3] = 0x3FF & ((src[3] >> 6) | ((uint16_t)src[4] << 2));

    for (n = 0; n < 4; n++)
    __loop__(
      invariant(n <= 4)
      invariant(array_bound(r->coeffs, 0, 4 * grp + n, 0, MLKEM_Q)))
    {
      r->coeffs[4 * grp + n] = mlk_scalar_decompress_d10(c[n]);
    }
  }

  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
/* 5-bit compression and serialization of a polynomial.
 *
 * Reference: `poly_compress()` in the reference implementation @[REF],
 *            for ML-KEM-1024.
 *            - Unlike the reference implementation, which works with
 *              coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1),
 *              the input is assumed to be unsigned canonical here. */
MLK_INTERNAL_API
void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
                          const mlk_poly *a)
{
  unsigned blk;
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_compress_d5_native(r, a->coeffs) == MLK_NATIVE_FUNC_SUCCESS)
  {
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);

  /* Each iteration turns 8 coefficients into 5 output bytes. */
  for (blk = 0; blk < MLKEM_N / 8; blk++)
  __loop__(invariant(blk <= MLKEM_N / 8))
  {
    unsigned lane;
    uint8_t v[8] = {0};
    for (lane = 0; lane < 8; lane++)
    __loop__(
      invariant(blk <= MLKEM_N / 8 && lane <= 8)
      invariant(array_bound(v, 0, lane, 0, 32)))
    {
      v[lane] = mlk_scalar_compress_d5(a->coeffs[8 * blk + lane]);
    }

    /*
     * The explicit 0xFF masks avoid CBMC warnings about implicit
     * truncation. r is addressed by array index rather than through
     * pointer arithmetic to keep verification simple.
     */
    r[5 * blk + 0] = 0xFF & ((v[0] >> 0) | (v[1] << 5));
    r[5 * blk + 1] = 0xFF & ((v[1] >> 3) | (v[2] << 2) | (v[3] << 7));
    r[5 * blk + 2] = 0xFF & ((v[3] >> 1) | (v[4] << 4));
    r[5 * blk + 3] = 0xFF & ((v[4] >> 4) | (v[5] << 1) | (v[6] << 6));
    r[5 * blk + 4] = 0xFF & ((v[6] >> 2) | (v[7] << 3));
  }
}
/* 11-bit compression and serialization of a polynomial.
 *
 * Reference: Embedded into `polyvec_compress()` in the
 *            reference implementation, for ML-KEM-1024.
 *            - Unlike the reference implementation, which works with
 *              coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1),
 *              the input is assumed to be unsigned canonical here. */
MLK_INTERNAL_API
void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
                           const mlk_poly *a)
{
  unsigned grp;
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_compress_d11_native(r, a->coeffs) == MLK_NATIVE_FUNC_SUCCESS)
  {
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);

  /* Each iteration turns 8 coefficients into 11 output bytes. */
  for (grp = 0; grp < MLKEM_N / 8; grp++)
  __loop__(invariant(grp <= MLKEM_N / 8))
  {
    unsigned n;
    uint16_t c[8];
    for (n = 0; n < 8; n++)
    __loop__(
      invariant(n <= 8)
      invariant(forall(r, 0, n, c[r] < (1u << 11))))
    {
      c[n] = mlk_scalar_compress_d11(a->coeffs[8 * grp + n]);
    }

    /*
     * All truncation to 8 bits is spelled out explicitly. Where no mask
     * is applied, nothing is lost because every c[n] fits in 11 bits.
     */
    r[11 * grp + 0] = (c[0] >> 0) & 0xFF;
    r[11 * grp + 1] = (c[0] >> 8) | ((c[1] << 3) & 0xFF);
    r[11 * grp + 2] = (c[1] >> 5) | ((c[2] << 6) & 0xFF);
    r[11 * grp + 3] = (c[2] >> 2) & 0xFF;
    r[11 * grp + 4] = (c[2] >> 10) | ((c[3] << 1) & 0xFF);
    r[11 * grp + 5] = (c[3] >> 7) | ((c[4] << 4) & 0xFF);
    r[11 * grp + 6] = (c[4] >> 4) | ((c[5] << 7) & 0xFF);
    r[11 * grp + 7] = (c[5] >> 1) & 0xFF;
    r[11 * grp + 8] = (c[5] >> 9) | ((c[6] << 2) & 0xFF);
    r[11 * grp + 9] = (c[6] >> 6) | ((c[7] << 5) & 0xFF);
    r[11 * grp + 10] = (c[7] >> 3);
  }
}
/* De-serialization and 5-bit decompression of a polynomial.
 *
 * Reference: `poly_decompress()` in the reference implementation @[REF],
 *            for ML-KEM-1024. */
MLK_INTERNAL_API
void mlk_poly_decompress_d5(mlk_poly *r,
                            const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
{
  unsigned blk;
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_decompress_d5_native(r->coeffs, a) == MLK_NATIVE_FUNC_SUCCESS)
  {
    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */

  for (blk = 0; blk < MLKEM_N / 8; blk++)
  __loop__(
    invariant(blk <= MLKEM_N / 8)
    invariant(array_bound(r->coeffs, 0, 8 * blk, 0, MLKEM_Q)))
  {
    unsigned lane;
    uint8_t v[8];
    const unsigned base = blk * 5;

    /*
     * Unpack 5 bytes (40 bits) into eight 5-bit values. The extraction is
     * written out without a loop for ease of proof, and every value is
     * masked explicitly so that CBMC does not warn about implicit
     * truncation.
     */
    v[0] = 0x1F & (a[base + 0] >> 0);
    v[1] = 0x1F & ((a[base + 0] >> 5) | (a[base + 1] << 3));
    v[2] = 0x1F & (a[base + 1] >> 2);
    v[3] = 0x1F & ((a[base + 1] >> 7) | (a[base + 2] << 1));
    v[4] = 0x1F & ((a[base + 2] >> 4) | (a[base + 3] << 4));
    v[5] = 0x1F & (a[base + 3] >> 1);
    v[6] = 0x1F & ((a[base + 3] >> 6) | (a[base + 4] << 2));
    v[7] = 0x1F & (a[base + 4] >> 3);

    /* Decompress into the matching slice of r. */
    for (lane = 0; lane < 8; lane++)
    __loop__(
      invariant(lane <= 8 && blk <= MLKEM_N / 8)
      invariant(array_bound(r->coeffs, 0, 8 * blk + lane, 0, MLKEM_Q)))
    {
      r->coeffs[8 * blk + lane] = mlk_scalar_decompress_d5(v[lane]);
    }
  }

  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
/* De-serialization and 11-bit decompression of a polynomial.
 *
 * Reference: Embedded into `polyvec_decompress()` in the
 *            reference implementation, for ML-KEM-1024. */
MLK_INTERNAL_API
void mlk_poly_decompress_d11(mlk_poly *r,
                             const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
{
  unsigned grp;
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_decompress_d11_native(r->coeffs, a) == MLK_NATIVE_FUNC_SUCCESS)
  {
    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */

  /* Each iteration turns 11 input bytes into 8 coefficients. */
  for (grp = 0; grp < MLKEM_N / 8; grp++)
  __loop__(
    invariant(grp <= MLKEM_N / 8)
    invariant(array_bound(r->coeffs, 0, 8 * grp, 0, MLKEM_Q)))
  {
    unsigned n;
    uint16_t c[8];
    const uint8_t *src = &a[11 * grp];

    /* Extract eight 11-bit values from the 88 input bits. */
    c[0] = 0x7FF & ((src[0] >> 0) | ((uint16_t)src[1] << 8));
    c[1] = 0x7FF & ((src[1] >> 3) | ((uint16_t)src[2] << 5));
    c[2] = 0x7FF & ((src[2] >> 6) | ((uint16_t)src[3] << 2) |
                    ((uint16_t)src[4] << 10));
    c[3] = 0x7FF & ((src[4] >> 1) | ((uint16_t)src[5] << 7));
    c[4] = 0x7FF & ((src[5] >> 4) | ((uint16_t)src[6] << 4));
    c[5] = 0x7FF & ((src[6] >> 7) | ((uint16_t)src[7] << 1) |
                    ((uint16_t)src[8] << 9));
    c[6] = 0x7FF & ((src[8] >> 2) | ((uint16_t)src[9] << 6));
    c[7] = 0x7FF & ((src[9] >> 5) | ((uint16_t)src[10] << 3));

    for (n = 0; n < 8; n++)
    __loop__(
      invariant(n <= 8)
      invariant(array_bound(r->coeffs, 0, 8 * grp + n, 0, MLKEM_Q)))
    {
      r->coeffs[8 * grp + n] = mlk_scalar_decompress_d11(c[n]);
    }
  }

  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
/* Serialization of a polynomial, 12 bits per coefficient.
 *
 * Reference: `poly_tobytes()` in the reference implementation @[REF].
 *            - Unlike the reference implementation, which works with
 *              coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1),
 *              the input is assumed to be unsigned canonical here. */
MLK_INTERNAL_API
void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
{
  unsigned pair;
#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);
  /* Try the native backend first; any result other than success falls
   * through to the C implementation below. */
  if (mlk_poly_tobytes_native(r, a->coeffs) == MLK_NATIVE_FUNC_SUCCESS)
  {
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);

  for (pair = 0; pair < MLKEM_N / 2; pair++)
  __loop__(invariant(pair <= MLKEM_N / 2))
  {
    /*
     * Both coefficients are < MLKEM_Q and thus carry at most 12
     * significant bits each, so the pair fits into 24 bits, that is,
     * exactly 3 bytes.
     */
    const uint16_t c0 = a->coeffs[2 * pair];
    const uint16_t c1 = a->coeffs[2 * pair + 1];

    /* Byte 0: bits 0 - 7 of c0. */
    r[3 * pair + 0] = c0 & 0xFF;
    /* Byte 1: bits 8 - 11 of c0 in the low nibble,
     *         bits 0 - 3 of c1 in the high nibble. */
    r[3 * pair + 1] = (c0 >> 8) | ((c1 << 4) & 0xF0);
    /* Byte 2: bits 4 - 11 of c1. */
    r[3 * pair + 2] = c1 >> 4;
  }
}
/* De-serialization of a polynomial from 12-bit packed bytes.
 *
 * Every group of 3 input bytes yields two coefficients. The coefficients
 * are 12-bit values, i.e. in [0, MLKEM_UINT12_LIMIT), and are therefore
 * not necessarily canonical.
 *
 * Reference: `poly_frombytes()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
{
  unsigned i;
#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
  int ret;
  ret = mlk_poly_frombytes_native(r->coeffs, a);
  if (ret == MLK_NATIVE_FUNC_SUCCESS)
  {
    /* Check the output bound on the native path as well, mirroring the
     * C fallback below and the native paths of the decompression
     * routines in this file. */
    mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
    return;
  }
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
  for (i = 0; i < MLKEM_N / 2; i++)
  __loop__(
    invariant(i <= MLKEM_N / 2)
    invariant(array_bound(r->coeffs, 0, 2 * i, 0, MLKEM_UINT12_LIMIT)))
  {
    const uint8_t t0 = a[3 * i + 0];
    const uint8_t t1 = a[3 * i + 1];
    const uint8_t t2 = a[3 * i + 2];
    /* First coefficient: t0 plus the low nibble of t1 as bits 8 - 11. */
    r->coeffs[2 * i + 0] = t0 | ((t1 << 8) & 0xFFF);
    /* Second coefficient: high nibble of t1 plus t2 as bits 4 - 11. */
    r->coeffs[2 * i + 1] = (t1 >> 4) | (t2 << 4);
  }
  /* Note that the coefficients are not canonical */
  mlk_assert_bound(r, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
/* Conversion of a message to a polynomial, one coefficient per message bit.
 *
 * Reference: `poly_frommsg()` in the reference implementation @[REF].
 *            - The bit-selection mask is passed through a value barrier to
 *              reduce the risk of compiler-introduced branches.
 *              The reference implementation contains the expression
 *              `(msg[i] >> j) & 1`, which the compiler can reason must
 *              be either 0 or 1. */
MLK_INTERNAL_API
void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
{
  unsigned byte;
#if (MLKEM_INDCPA_MSGBYTES != MLKEM_N / 8)
#error "MLKEM_INDCPA_MSGBYTES must be equal to MLKEM_N/8 bytes!"
#endif

  for (byte = 0; byte < MLKEM_N / 8; byte++)
  __loop__(
    invariant(byte <= MLKEM_N / 8)
    invariant(array_bound(r->coeffs, 0, 8 * byte, 0, MLKEM_Q)))
  {
    unsigned bit;
    for (bit = 0; bit < 8; bit++)
    __loop__(
      invariant(byte < MLKEM_N / 8 && bit <= 8)
      invariant(array_bound(r->coeffs, 0, 8 * byte + bit, 0, MLKEM_Q)))
    {
      /* Hide the single-bit mask from the compiler so that the operation
       * is not recognized as a bit selection. */
      uint8_t mask = mlk_value_barrier_u8(1u << bit);
      /* mlk_ct_sel_int16(MLKEM_Q_HALF, 0, b) is `Decompress_1(b != 0)`
       * as per @[FIPS203, Eq (4.8)]. */
      r->coeffs[8 * byte + bit] =
          mlk_ct_sel_int16(MLKEM_Q_HALF, 0, msg[byte] & mask);
    }
  }

  mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
/* Conversion of a polynomial to a message, one message bit per coefficient.
 *
 * Reference: `poly_tomsg()` in the reference implementation @[REF].
 *            - Unlike the reference implementation, which works with
 *              coefficients in the range (-MLKEM_Q+1,...,MLKEM_Q-1),
 *              the input is assumed to be unsigned canonical here.
 */
MLK_INTERNAL_API
void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *a)
{
  unsigned byte;
  mlk_assert_bound(a, MLKEM_N, 0, MLKEM_Q);

  for (byte = 0; byte < MLKEM_N / 8; byte++)
  __loop__(invariant(byte <= MLKEM_N / 8))
  {
    unsigned pos;
    /* Start from an all-zero byte and OR in one bit per coefficient. */
    msg[byte] = 0;
    for (pos = 0; pos < 8; pos++)
    __loop__(
      invariant(byte <= MLKEM_N / 8 && pos <= 8))
    {
      uint32_t bit = mlk_scalar_compress_d1(a->coeffs[8 * byte + pos]);
      msg[byte] |= bit << pos;
    }
  }
}
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
/* Nothing from this file is compiled in this configuration; emit a dummy
 * extern declaration so that the compilation unit is not empty. */
MLK_EMPTY_CU(compress)
#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */

View File

@@ -0,0 +1,666 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
#ifndef MLK_COMPRESS_H
#define MLK_COMPRESS_H
#include <stddef.h>
#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "debug.h"
#include "poly.h"
#include "verify.h"
/************************************************************
 * Name: mlk_scalar_compress_d1
 *
 * Description: Computes round(u * 2 / q) % 2
 *
 * Arguments: - u: Unsigned canonical representative modulo q
 *                 to be compressed.
 *
 * Returns: The compressed value, which is 0 or 1.
 *
 * Specification: Compress_1 from @[FIPS203, Eq (4.7)].
 *
 ************************************************************/
/*
 * The 32-bit arithmetic in this routine wraps around for large
 * values of u. This is expected and required.
 */
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "unsigned-overflow"
#endif
/* Reference: Part of poly_tomsg() in the reference implementation @[REF]. */
static MLK_INLINE uint32_t mlk_scalar_compress_d1(uint16_t u)
__contract__(
  requires(u <= MLKEM_Q - 1)
  ensures(return_value < 2)
  ensures(return_value == (((uint32_t)u * 2 + MLKEM_Q / 2) / MLKEM_Q) % 2)  )
{
  /* Division by MLKEM_Q is replaced by a multiplication and a shift:
   * ```
   * round(u * 2 / MLKEM_Q)
   *   = round(u * 2 * (2^31 / MLKEM_Q) / 2^31)
   *  ~= round(u * 2 * round(2^31 / MLKEM_Q) / 2^31)
   * ```
   */
  /* check-magic: 1290168 == 2*round(2^31 / MLKEM_Q) */
  const uint32_t scaled = (uint32_t)u * 1290168;
  /* Adding 2^30 before shifting by 31 rounds to nearest. */
  return (scaled + (1u << 30)) >> 31;
}
#ifdef CBMC
#pragma CPROVER check pop
#endif
/************************************************************
 * Name: mlk_scalar_compress_d4
 *
 * Description: Computes round(u * 16 / q) % 16
 *
 * Arguments: - u: Unsigned canonical representative modulo q
 *                 to be compressed.
 *
 * Returns: The compressed value, in the range [0, 16).
 *
 * Specification: Compress_4 from @[FIPS203, Eq (4.7)].
 *
 ************************************************************/
/*
 * The 32-bit arithmetic in this routine wraps around for large
 * values of u. This is expected and required.
 */
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "unsigned-overflow"
#endif
/* Reference: Embedded into `poly_compress()` in the
 *            reference implementation @[REF]. */
static MLK_INLINE uint32_t mlk_scalar_compress_d4(uint16_t u)
__contract__(
  requires(u <= MLKEM_Q - 1)
  ensures(return_value < 16)
  ensures(return_value == (((uint32_t)u * 16 + MLKEM_Q / 2) / MLKEM_Q) % 16))
{
  /* Division by MLKEM_Q is replaced by a multiplication and a shift:
   * ```
   * round(u * 16 / MLKEM_Q)
   *   = round(u * 16 * (2^28 / MLKEM_Q) / 2^28)
   *  ~= round(u * 16 * round(2^28 / MLKEM_Q) / 2^28)
   * ```
   */
  /* check-magic: 1290160 == 16 * round(2^28 / MLKEM_Q) */
  const uint32_t scaled = (uint32_t)u * 1290160;
  /* round(scaled / 2^28) */
  return (scaled + (1u << 27)) >> 28;
}
#ifdef CBMC
#pragma CPROVER check pop
#endif
/************************************************************
 * Name: mlk_scalar_decompress_d4
 *
 * Description: Computes round(u * q / 16)
 *
 * Arguments: - u: Unsigned canonical representative modulo 16
 *                 to be decompressed.
 *
 * Returns: The decompressed value, which is at most MLKEM_Q - 1.
 *
 * Specification: Decompress_4 from @[FIPS203, Eq (4.8)].
 *
 ************************************************************/
/* Reference: Embedded into `poly_decompress()` in the
 *            reference implementation @[REF]. */
static MLK_INLINE uint16_t mlk_scalar_decompress_d4(uint32_t u)
__contract__(
  requires(0 <= u && u < 16)
  ensures(return_value <= (MLKEM_Q - 1))
)
{
  /* Adding 8 == 16 / 2 before shifting by 4 rounds to nearest. */
  const uint32_t scaled = u * MLKEM_Q;
  return (scaled + 8) >> 4;
}
/************************************************************
 * Name: mlk_scalar_compress_d5
 *
 * Description: Computes round(u * 32 / q) % 32
 *
 * Arguments: - u: Unsigned canonical representative modulo q
 *                 to be compressed.
 *
 * Returns: The compressed value, in the range [0, 32).
 *
 * Specification: Compress_5 from @[FIPS203, Eq (4.7)].
 *
 ************************************************************/
/*
 * The 32-bit arithmetic in this routine wraps around for large
 * values of u. This is expected and required.
 */
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "unsigned-overflow"
#endif
/* Reference: Embedded into `poly_compress()` in the
 *            reference implementation @[REF]. */
static MLK_INLINE uint32_t mlk_scalar_compress_d5(uint16_t u)
__contract__(
  requires(u <= MLKEM_Q - 1)
  ensures(return_value < 32)
  ensures(return_value == (((uint32_t)u * 32 + MLKEM_Q / 2) / MLKEM_Q) % 32)  )
{
  /* Division by MLKEM_Q is replaced by a multiplication and a shift:
   * ```
   * round(u * 32 / MLKEM_Q)
   *   = round(u * 32 * (2^27 / MLKEM_Q) / 2^27)
   *  ~= round(u * 32 * round(2^27 / MLKEM_Q) / 2^27)
   * ```
   */
  /* check-magic: 1290176 == 2^5 * round(2^27 / MLKEM_Q) */
  const uint32_t scaled = (uint32_t)u * 1290176;
  /* round(scaled / 2^27) */
  return (scaled + (1u << 26)) >> 27;
}
#ifdef CBMC
#pragma CPROVER check pop
#endif
/************************************************************
 * Name: mlk_scalar_decompress_d5
 *
 * Description: Computes round(u * q / 32)
 *
 * Arguments: - u: Unsigned canonical representative modulo 32
 *                 to be decompressed.
 *
 * Returns: The decompressed value, which is at most MLKEM_Q - 1.
 *
 * Specification: Decompress_5 from @[FIPS203, Eq (4.8)].
 *
 ************************************************************/
/* Reference: Embedded into `poly_decompress()` in the
 *            reference implementation @[REF]. */
static MLK_INLINE uint16_t mlk_scalar_decompress_d5(uint32_t u)
__contract__(
  requires(0 <= u && u < 32)
  ensures(return_value <= MLKEM_Q - 1)
)
{
  /* Adding 16 == 32 / 2 before shifting by 5 rounds to nearest. */
  const uint32_t scaled = u * MLKEM_Q;
  return (scaled + 16) >> 5;
}
/************************************************************
 * Name: mlk_scalar_compress_d10
 *
 * Description: Computes round(u * 2**10 / q) % 2**10
 *
 * Arguments: - u: Unsigned canonical representative modulo q
 *                 to be compressed.
 *
 * Returns: The compressed value, in the range [0, 2**10).
 *
 * Specification: Compress_10 from @[FIPS203, Eq (4.7)].
 *
 ************************************************************/
/*
 * The unsigned-overflow check is disabled around this routine, as it is
 * for the 32-bit compression routines.
 * NOTE(review): the arithmetic below is carried out in uint64_t, where it
 * does not look like it can wrap for u < MLKEM_Q -- confirm whether the
 * suppression is still needed here.
 */
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "unsigned-overflow"
#endif
/* Reference: Embedded into `polyvec_compress()` in the
 *            reference implementation @[REF]. */
static MLK_INLINE uint32_t mlk_scalar_compress_d10(uint16_t u)
__contract__(
  requires(u <= MLKEM_Q - 1)
  ensures(return_value < (1u << 10))
  ensures(return_value == (((uint32_t)u * (1u << 10) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 10)))
{
  /* Division by MLKEM_Q is replaced by a multiplication and a shift:
   * ```
   * round(u * 1024 / MLKEM_Q)
   *   = round(u * 1024 * (2^33 / MLKEM_Q) / 2^33)
   *  ~= round(u * 1024 * round(2^33 / MLKEM_Q) / 2^33)
   * ```
   */
  /* check-magic: 2642263040 == 2^10 * round(2^33 / MLKEM_Q) */
  const uint64_t scaled = (uint64_t)u * 2642263040;
  /* round(scaled / 2^33) */
  const uint64_t rounded = (scaled + ((uint64_t)1u << 32)) >> 33;
  /* Reduce modulo 2^10. */
  return (rounded & 0x3FF);
}
#ifdef CBMC
#pragma CPROVER check pop
#endif
/************************************************************
 * Name: mlk_scalar_decompress_d10
 *
 * Description: Computes round(u * q / 1024)
 *
 * Arguments: - u: Unsigned canonical representative modulo 1024
 *                 to be decompressed.
 *
 * Returns: The decompressed value, which is at most MLKEM_Q - 1.
 *
 * Specification: Decompress_10 from @[FIPS203, Eq (4.8)].
 *
 ************************************************************/
/* Reference: Embedded into `polyvec_decompress()` in the
 *            reference implementation @[REF]. */
static MLK_INLINE uint16_t mlk_scalar_decompress_d10(uint32_t u)
__contract__(
  requires(0 <= u && u < 1024)
  ensures(return_value <= (MLKEM_Q - 1))
)
{
  /* Adding 512 == 1024 / 2 before shifting by 10 rounds to nearest. */
  const uint32_t scaled = u * MLKEM_Q;
  return (scaled + 512) >> 10;
}
/************************************************************
 * Name: mlk_scalar_compress_d11
 *
 * Description: Computes round(u * 2**11 / q) % 2**11
 *
 * Arguments: - u: Unsigned canonical representative modulo q
 *                 to be compressed.
 *
 * Returns: The compressed value, in the range [0, 2**11).
 *
 * Specification: Compress_11 from @[FIPS203, Eq (4.7)].
 *
 ************************************************************/
/*
 * The unsigned-overflow check is disabled around this routine, as it is
 * for the 32-bit compression routines.
 * NOTE(review): the arithmetic below is carried out in uint64_t, where it
 * does not look like it can wrap for u < MLKEM_Q -- confirm whether the
 * suppression is still needed here.
 */
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "unsigned-overflow"
#endif
/* Reference: Embedded into `polyvec_compress()` in the
 *            reference implementation @[REF]. */
static MLK_INLINE uint32_t mlk_scalar_compress_d11(uint16_t u)
__contract__(
  requires(u <= MLKEM_Q - 1)
  ensures(return_value < (1u << 11))
  ensures(return_value == (((uint32_t)u * (1u << 11) + MLKEM_Q / 2) / MLKEM_Q) % (1 << 11)))
{
  /* Division by MLKEM_Q is replaced by a multiplication and a shift:
   * ```
   * round(u * 2048 / MLKEM_Q)
   *   = round(u * 2048 * (2^33 / MLKEM_Q) / 2^33)
   *  ~= round(u * 2048 * round(2^33 / MLKEM_Q) / 2^33)
   * ```
   */
  /* check-magic: 5284526080 == 2^11 * round(2^33 / MLKEM_Q) */
  const uint64_t scaled = (uint64_t)u * 5284526080;
  /* round(scaled / 2^33) */
  const uint64_t rounded = (scaled + ((uint64_t)1u << 32)) >> 33;
  /* Reduce modulo 2^11. */
  return (rounded & 0x7FF);
}
#ifdef CBMC
#pragma CPROVER check pop
#endif
/************************************************************
 * Name: mlk_scalar_decompress_d11
 *
 * Description: Computes round(u * q / 2048)
 *
 * Arguments: - u: Unsigned canonical representative modulo 2048
 *                 to be decompressed.
 *
 * Returns: The decompressed value, which is at most MLKEM_Q - 1.
 *
 * Specification: Decompress_11 from @[FIPS203, Eq (4.8)].
 *
 ************************************************************/
/* Reference: Embedded into `polyvec_decompress()` in the
 *            reference implementation @[REF]. */
static MLK_INLINE uint16_t mlk_scalar_decompress_d11(uint32_t u)
__contract__(
  requires(0 <= u && u < 2048)
  ensures(return_value <= (MLKEM_Q - 1))
)
{
  /* Adding 1024 == 2048 / 2 before shifting by 11 rounds to nearest. */
  const uint32_t scaled = u * MLKEM_Q;
  return (scaled + 1024) >> 11;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
/* Map the short internal name onto the namespaced symbol. */
#define mlk_poly_compress_d4 MLK_NAMESPACE(poly_compress_d4)
/*************************************************
* Name: mlk_poly_compress_d4
*
* Description: Compression (4 bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
* - const mlk_poly *a: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Returns: Nothing; the packed result is written to r.
*
* Specification: Implements `ByteEncode_4 (Compress_4 (a))`:
* - ByteEncode_d: @[FIPS203, Algorithm 5],
* - Compress_d: @[FIPS203, Eq (4.7)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `ByteEncode_{d_v} (Compress_{d_v} (v))` appears in
* @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L23],
* where `d_v=4` for ML-KEM-{512,768} @[FIPS203, Table 2].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_compress_d4(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const mlk_poly *a);
#define mlk_poly_compress_d10 MLK_NAMESPACE(poly_compress_d10)
/*************************************************
* Name: mlk_poly_compress_d10
*
* Description: Compression (10 bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
* - const mlk_poly *a: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Specification: Implements `ByteEncode_10 (Compress_10 (a))`:
* - ByteEncode_d: @[FIPS203, Algorithm 5],
* - Compress_d: @[FIPS203, Eq (4.7)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `ByteEncode_{d_u} (Compress_{d_u} (u))` appears in
* @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22],
* where `d_u=10` for ML-KEM-{512,768} @[FIPS203, Table 2].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_compress_d10(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const mlk_poly *a);
#define mlk_poly_decompress_d4 MLK_NAMESPACE(poly_decompress_d4)
/*************************************************
* Name: mlk_poly_decompress_d4
*
* Description: De-serialization and subsequent decompression (4 bits) of a
* polynomial; approximate inverse of mlk_poly_compress_d4
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Specification: Implements `Decompress_4 (ByteDecode_4 (a))`:
* - ByteDecode_d: @[FIPS203, Algorithm 6],
* - Decompress_d: @[FIPS203, Eq (4.8)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `Decompress_{d_v} (ByteDecode_{d_v} (v))` appears in
* @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L4],
* where `d_v=4` for ML-KEM-{512,768} @[FIPS203, Table 2].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_decompress_d4(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
#define mlk_poly_decompress_d10 MLK_NAMESPACE(poly_decompress_d10)
/*************************************************
* Name: mlk_poly_decompress_d10
*
* Description: De-serialization and subsequent decompression (10 bits) of a
* polynomial; approximate inverse of mlk_poly_compress_d10
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Specification: Implements `Decompress_10 (ByteDecode_10 (a))`:
* - ByteDecode_d: @[FIPS203, Algorithm 6],
* - Decompress_d: @[FIPS203, Eq (4.8)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `Decompress_{d_u} (ByteDecode_{d_u} (u))` appears in
* @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L3],
* where `d_u=10` for ML-KEM-{512,768} @[FIPS203, Table 2].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_decompress_d10(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
#define mlk_poly_compress_d5 MLK_NAMESPACE(poly_compress_d5)
/*************************************************
* Name: mlk_poly_compress_d5
*
* Description: Compression (5 bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
* - const mlk_poly *a: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Specification: Implements `ByteEncode_5 (Compress_5 (a))`:
* - ByteEncode_d: @[FIPS203, Algorithm 5],
* - Compress_d: @[FIPS203, Eq (4.7)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `ByteEncode_{d_v} (Compress_{d_v} (v))` appears in
* @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L23],
* where `d_v=5` for ML-KEM-1024 @[FIPS203, Table 2].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_compress_d5(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
const mlk_poly *a);
#define mlk_poly_compress_d11 MLK_NAMESPACE(poly_compress_d11)
/*************************************************
* Name: mlk_poly_compress_d11
*
* Description: Compression (11 bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
* - const mlk_poly *a: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Specification: Implements `ByteEncode_11 (Compress_11 (a))`:
* - ByteEncode_d: @[FIPS203, Algorithm 5],
* - Compress_d: @[FIPS203, Eq (4.7)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `ByteEncode_{d_u} (Compress_{d_u} (u))` appears in
* @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22],
* where `d_u=11` for ML-KEM-1024 @[FIPS203, Table 2].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_compress_d11(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
const mlk_poly *a);
#define mlk_poly_decompress_d5 MLK_NAMESPACE(poly_decompress_d5)
/*************************************************
* Name: mlk_poly_decompress_d5
*
* Description: De-serialization and subsequent decompression (5 bits) of a
* polynomial; approximate inverse of mlk_poly_compress_d5
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Specification: Implements `Decompress_5 (ByteDecode_5 (a))`:
* - ByteDecode_d: @[FIPS203, Algorithm 6],
* - Decompress_d: @[FIPS203, Eq (4.8)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `Decompress_{d_v} (ByteDecode_{d_v} (v))` appears in
* @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L4],
* where `d_v=5` for ML-KEM-1024 @[FIPS203, Table 2].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_decompress_d5(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
#define mlk_poly_decompress_d11 MLK_NAMESPACE(poly_decompress_d11)
/*************************************************
* Name: mlk_poly_decompress_d11
*
* Description: De-serialization and subsequent decompression (11 bits) of a
* polynomial; approximate inverse of mlk_poly_compress_d11
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Specification: Implements `Decompress_11 (ByteDecode_11 (a))`:
* - ByteDecode_d: @[FIPS203, Algorithm 6],
* - Decompress_d: @[FIPS203, Eq (4.8)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `Decompress_{d_u} (ByteDecode_{d_u} (u))` appears in
* @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L3],
* where `d_u=11` for ML-KEM-1024 @[FIPS203, Table 2].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_decompress_d11(mlk_poly *r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#define mlk_poly_tobytes MLK_NAMESPACE(poly_tobytes)
/*************************************************
* Name: mlk_poly_tobytes
*
* Description: Serialization of a polynomial.
* Signed coefficients are converted to
* unsigned form before serialization.
*
* Arguments: INPUT:
* - a: const pointer to input polynomial,
* with each coefficient in the range [0,1,..,Q-1]
* OUTPUT
* - r: pointer to output byte array
* (of MLKEM_POLYBYTES bytes)
*
* Specification: Implements ByteEncode_12 @[FIPS203, Algorithm 5].
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_tobytes(uint8_t r[MLKEM_POLYBYTES], const mlk_poly *a)
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(mlk_poly)))
requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
assigns(object_whole(r))
);
#define mlk_poly_frombytes MLK_NAMESPACE(poly_frombytes)
/*************************************************
* Name: mlk_poly_frombytes
*
* Description: De-serialization of a polynomial.
*
* Arguments: INPUT
* - a: pointer to input byte array
* (of MLKEM_POLYBYTES bytes)
* OUTPUT
* - r: pointer to output polynomial, with
* each coefficient unsigned and in the range
* 0 .. 4095
*
* Specification: Implements ByteDecode_12 @[FIPS203, Algorithm 6].
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_frombytes(mlk_poly *r, const uint8_t a[MLKEM_POLYBYTES])
__contract__(
requires(memory_no_alias(a, MLKEM_POLYBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
);
#define mlk_poly_frommsg MLK_NAMESPACE(poly_frommsg)
/*************************************************
* Name: mlk_poly_frommsg
*
* Description: Convert 32-byte message to polynomial
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *msg: pointer to input message
*
* Specification: Implements `Decompress_1 (ByteDecode_1 (a))`:
* - ByteDecode_d: @[FIPS203, Algorithm 6],
* - Decompress_d: @[FIPS203, Eq (4.8)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `Decompress_1 (ByteDecode_1 (m))` appears in
* @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L20].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_frommsg(mlk_poly *r, const uint8_t msg[MLKEM_INDCPA_MSGBYTES])
__contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
assigns(object_whole(r))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
);
#define mlk_poly_tomsg MLK_NAMESPACE(poly_tomsg)
/*************************************************
* Name: mlk_poly_tomsg
*
* Description: Convert polynomial to 32-byte message
*
* Arguments: - uint8_t *msg: pointer to output message
* - const mlk_poly *r: pointer to input polynomial
* Coefficients must be unsigned canonical
*
* Specification: Implements `ByteEncode_1 (Compress_1 (a))`:
* - ByteEncode_d: @[FIPS203, Algorithm 5],
* - Compress_d: @[FIPS203, Eq (4.7)]
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* - `ByteEncode_1 (Compress_1 (w))` appears in
* @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L7].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_tomsg(uint8_t msg[MLKEM_INDCPA_MSGBYTES], const mlk_poly *r)
__contract__(
requires(memory_no_alias(msg, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
assigns(object_whole(msg))
);
#endif /* !MLK_COMPRESS_H */

View File

@@ -0,0 +1,64 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* NOTE: You can remove this file unless you compile with MLKEM_DEBUG. */
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED) && defined(MLKEM_DEBUG)
#include <stdio.h>
#include <stdlib.h>
#include "debug.h"
#define MLK_DEBUG_ERROR_HEADER "[ERROR:%s:%04d] "
/* Debug assertion: does nothing if val is non-zero; otherwise reports the
 * failing location (file, line) on stderr and terminates via exit(1). */
void mlk_debug_check_assert(const char *file, int line, const int val)
{
  if (val != 0)
  {
    /* Assertion holds -- nothing to report. */
    return;
  }

  fprintf(stderr, MLK_DEBUG_ERROR_HEADER "Assertion failed (value %d)\n",
          file, line, val);
  exit(1);
}
/* Checks that every element of ptr[0..len-1] lies strictly between
 * lower_bound_exclusive and upper_bound_exclusive. All violations are
 * reported on stderr before the process is terminated via exit(1). */
void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
                            unsigned len, int lower_bound_exclusive,
                            int upper_bound_exclusive)
{
  unsigned idx;
  int violation_seen = 0;

  for (idx = 0; idx < len; idx++)
  {
    const int16_t cur = ptr[idx];
    if (cur > lower_bound_exclusive && cur < upper_bound_exclusive)
    {
      continue;
    }

    /* Keep scanning after a failure so that every offending index is
     * printed, not only the first one. */
    fprintf(
        stderr,
        MLK_DEBUG_ERROR_HEADER
        "Bounds assertion failed: Index %u, value %d out of bounds (%d,%d)\n",
        file, line, idx, (int)cur, lower_bound_exclusive,
        upper_bound_exclusive);
    violation_seen = 1;
  }

  if (violation_seen != 0)
  {
    exit(1);
  }
}
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED && MLKEM_DEBUG */
MLK_EMPTY_CU(debug)
#endif /* !(!MLK_CONFIG_MULTILEVEL_NO_SHARED && MLKEM_DEBUG) */
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef MLK_DEBUG_ERROR_HEADER

View File

@@ -0,0 +1,129 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_DEBUG_H
#define MLK_DEBUG_H
#include "common.h"
#if defined(MLKEM_DEBUG)
#include <stdint.h>
/*************************************************
* Name: mlk_debug_check_assert
*
* Description: Check a debug assertion.
*
* Prints an error message to stderr and calls
* exit(1) if the asserted value is zero.
*
* Arguments: - file: filename
* - line: line number
* - val: Value asserted to be non-zero
**************************************************/
#define mlk_debug_check_assert MLK_NAMESPACE(mlkem_debug_assert)
void mlk_debug_check_assert(const char *file, int line, const int val);
/*************************************************
* Name: mlk_debug_check_bounds
*
* Description: Check whether values in an array of int16_t
* are within specified bounds.
*
* Prints an error message to stderr and calls
* exit(1) if not.
*
* Arguments: - file: filename
* - line: line number
* - ptr: Base of array to be checked
* - len: Number of int16_t in ptr
* - lower_bound_exclusive: Exclusive lower bound
* - upper_bound_exclusive: Exclusive upper bound
**************************************************/
#define mlk_debug_check_bounds MLK_NAMESPACE(mlkem_debug_check_bounds)
void mlk_debug_check_bounds(const char *file, int line, const int16_t *ptr,
unsigned len, int lower_bound_exclusive,
int upper_bound_exclusive);
/* Check assertion, calling exit() upon failure
*
* val: Value that's asserted to be non-zero
*/
#define mlk_assert(val) mlk_debug_check_assert(__FILE__, __LINE__, (val))
/* Check bounds in array of int16_t's
* ptr: Base of int16_t array; will be explicitly cast to int16_t*,
* so you may pass a byte-compatible type such as mlk_poly or mlk_polyvec.
* len: Number of int16_t in array
* value_lb: Inclusive lower value bound
* value_ub: Exclusive upper value bound
*
* mlk_debug_check_bounds() takes an exclusive lower bound, so the
* inclusive bound value_lb is forwarded as (value_lb) - 1. */
#define mlk_assert_bound(ptr, len, value_lb, value_ub) \
mlk_debug_check_bounds(__FILE__, __LINE__, (const int16_t *)(ptr), (len), \
(value_lb) - 1, (value_ub))
/* Check absolute bounds in array of int16_t's
* ptr: Base of array, expression of type int16_t*
* len: Number of int16_t in array
* value_abs_bd: Exclusive absolute upper bound
*
* Forwards to mlk_assert_bound with inclusive lower bound
* -(value_abs_bd) + 1 and exclusive upper bound value_abs_bd, i.e. every
* element x must satisfy -value_abs_bd < x < value_abs_bd. */
#define mlk_assert_abs_bound(ptr, len, value_abs_bd) \
mlk_assert_bound((ptr), (len), (-(value_abs_bd) + 1), (value_abs_bd))
/* Version of bounds assertions for 2-dimensional arrays
*
* In debug builds the len0 x len1 array is checked as one flat array of
* (len0) * (len1) int16_t values through the 1-dimensional macros. */
#define mlk_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
mlk_assert_bound((ptr), ((len0) * (len1)), (value_lb), (value_ub))
#define mlk_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
mlk_assert_abs_bound((ptr), ((len0) * (len1)), (value_abs_bd))
/* When running CBMC, convert debug assertions into proof obligations */
#elif defined(CBMC)
#include "cbmc.h"
/* Each debug assertion is forwarded to cassert(); the bound predicates
 * array_bound / array_abs_bound and the quantifier forall are not defined
 * in this file (presumably provided by cbmc.h -- confirm there). */
#define mlk_assert(val) cassert(val)
/* 1-dimensional bounds: ptr is cast to int16_t* and checked over the index
 * range starting at 0 with length argument (len). */
#define mlk_assert_bound(ptr, len, value_lb, value_ub) \
cassert(array_bound(((int16_t *)(ptr)), 0, (len), (value_lb), (value_ub)))
#define mlk_assert_abs_bound(ptr, len, value_abs_bd) \
cassert(array_abs_bound(((int16_t *)(ptr)), 0, (len), (value_abs_bd)))
/* Because of https://github.com/diffblue/cbmc/issues/8570, we can't
* just use a single flattened array_bound(...) here.
*
* Instead, ptr is viewed as an array of rows of N int16_t each, and the
* bound is asserted row by row, with the row index kN quantified by
* forall(kN, 0, (M), ...). */
#define mlk_assert_bound_2d(ptr, M, N, value_lb, value_ub) \
cassert(forall(kN, 0, (M), \
array_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
(value_lb), (value_ub))))
#define mlk_assert_abs_bound_2d(ptr, M, N, value_abs_bd) \
cassert(forall(kN, 0, (M), \
array_abs_bound(&((int16_t(*)[(N)])(ptr))[kN][0], 0, (N), \
(value_abs_bd))))
#else /* !MLKEM_DEBUG && CBMC */
/* Neither MLKEM_DEBUG nor CBMC is defined: every assertion macro expands
 * to an empty do/while(0) statement. The macro arguments do not appear in
 * the expansion and are therefore never evaluated. */
#define mlk_assert(val) \
do \
{ \
} while (0)
#define mlk_assert_bound(ptr, len, value_lb, value_ub) \
do \
{ \
} while (0)
#define mlk_assert_abs_bound(ptr, len, value_abs_bd) \
do \
{ \
} while (0)
#define mlk_assert_bound_2d(ptr, len0, len1, value_lb, value_ub) \
do \
{ \
} while (0)
#define mlk_assert_abs_bound_2d(ptr, len0, len1, value_abs_bd) \
do \
{ \
} while (0)
#endif /* !MLKEM_DEBUG && !CBMC */
#endif /* !MLK_DEBUG_H */

View File

@@ -0,0 +1,527 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "cbmc.h"
#include "debug.h"
#include "indcpa.h"
#include "poly.h"
#include "poly_k.h"
#include "randombytes.h"
#include "sampling.h"
#include "symmetric.h"
/* Parameter set namespacing
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying parameter sets)
* within a single compilation unit. */
#define mlk_pack_pk MLK_ADD_PARAM_SET(mlk_pack_pk)
#define mlk_unpack_pk MLK_ADD_PARAM_SET(mlk_unpack_pk)
#define mlk_pack_sk MLK_ADD_PARAM_SET(mlk_pack_sk)
#define mlk_unpack_sk MLK_ADD_PARAM_SET(mlk_unpack_sk)
#define mlk_pack_ciphertext MLK_ADD_PARAM_SET(mlk_pack_ciphertext)
#define mlk_unpack_ciphertext MLK_ADD_PARAM_SET(mlk_unpack_ciphertext)
#define mlk_matvec_mul MLK_ADD_PARAM_SET(mlk_matvec_mul)
/* End of parameter set namespacing */
/*************************************************
* Name: mlk_pack_pk
*
* Description: Serialize the public key as concatenation of the
* serialized vector of polynomials pk
* and the public seed used to generate the matrix A.
*
* Arguments: uint8_t *r: pointer to the output serialized public key
* mlk_polyvec pk: pointer to the input public-key mlk_polyvec.
* Must have coefficients within [0,..,q-1].
* const uint8_t *seed: pointer to the input public seed
*
* Specification:
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L19]
*
**************************************************/
static void mlk_pack_pk(uint8_t r[MLKEM_INDCPA_PUBLICKEYBYTES], mlk_polyvec pk,
                        const uint8_t seed[MLKEM_SYMBYTES])
{
  /* The seed is stored directly behind the serialized polynomial vector. */
  uint8_t *const seed_dst = r + MLKEM_POLYVECBYTES;

  /* Debug-only check that all coefficients lie in [0, MLKEM_Q). */
  mlk_assert_bound_2d(pk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);

  mlk_polyvec_tobytes(r, pk);
  mlk_memcpy(seed_dst, seed, MLKEM_SYMBYTES);
}
/*************************************************
* Name: mlk_unpack_pk
*
* Description: De-serialize public key from a byte array;
* approximate inverse of mlk_pack_pk
*
* Arguments: - mlk_polyvec pk: pointer to output public-key polynomial
* vector Coefficients will be normalized to [0,..,q-1].
* - uint8_t *seed: pointer to output seed to generate matrix A
* - const uint8_t *packedpk: pointer to input serialized public
* key.
*
* Specification:
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L2-3]
*
**************************************************/
static void mlk_unpack_pk(mlk_polyvec pk, uint8_t seed[MLKEM_SYMBYTES],
                          const uint8_t packedpk[MLKEM_INDCPA_PUBLICKEYBYTES])
{
  /* The seed follows the serialized polynomial vector in the packed key. */
  const uint8_t *const seed_src = packedpk + MLKEM_POLYVECBYTES;

  mlk_polyvec_frombytes(pk, packedpk);
  mlk_memcpy(seed, seed_src, MLKEM_SYMBYTES);

  /* NOTE: If a modulus check was conducted on the PK, the coefficients of
   * `pk` are known to be unsigned canonical at this point. The
   * specifications and proofs deliberately do _not_ rely on that; they
   * use the easily provable bound by MLKEM_UINT12_LIMIT instead. */
}
/*************************************************
* Name: mlk_pack_sk
*
* Description: Serialize the secret key
*
* Arguments: - uint8_t *r: pointer to output serialized secret key
* - mlk_polyvec sk: pointer to input vector of polynomials
* (secret key)
*
* Specification:
* Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L20]
*
**************************************************/
static void mlk_pack_sk(uint8_t r[MLKEM_INDCPA_SECRETKEYBYTES], mlk_polyvec sk)
{
/* Debug-only check that all MLKEM_K * MLKEM_N coefficients of sk lie in
 * [0, MLKEM_Q) before they are serialized. */
mlk_assert_bound_2d(sk, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
mlk_polyvec_tobytes(r, sk);
}
/*************************************************
* Name: mlk_unpack_sk
*
* Description: De-serialize the secret key; inverse of mlk_pack_sk
*
* Arguments: - mlk_polyvec sk: pointer to output vector of polynomials
* (secret key)
* - const uint8_t *packedsk: pointer to input serialized secret
* key
*
* Specification:
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L5]
*
**************************************************/
static void mlk_unpack_sk(mlk_polyvec sk,
const uint8_t packedsk[MLKEM_INDCPA_SECRETKEYBYTES])
{
/* Thin wrapper: the packed secret key consists solely of the serialized
 * polynomial vector, so it is handed to mlk_polyvec_frombytes() as is. */
mlk_polyvec_frombytes(sk, packedsk);
}
/*************************************************
* Name: mlk_pack_ciphertext
*
* Description: Serialize the ciphertext as concatenation of the
* compressed and serialized vector of polynomials b
* and the compressed and serialized polynomial v
*
* Arguments: uint8_t *r: pointer to the output serialized ciphertext
* mlk_polyvec b: pointer to the input vector of polynomials b
* mlk_poly *v: pointer to the input polynomial v
*
* Specification:
* Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22-23]
*
**************************************************/
static void mlk_pack_ciphertext(uint8_t r[MLKEM_INDCPA_BYTES], mlk_polyvec b,
                                mlk_poly *v)
{
  /* The compressed polynomial v is stored right after the compressed
   * vector b. */
  uint8_t *const v_dst = r + MLKEM_POLYVECCOMPRESSEDBYTES_DU;

  mlk_polyvec_compress_du(r, b);
  mlk_poly_compress_dv(v_dst, v);
}
/*************************************************
* Name: mlk_unpack_ciphertext
*
* Description: De-serialize and decompress ciphertext from a byte array;
* approximate inverse of mlk_pack_ciphertext
*
* Arguments: - mlk_polyvec b: pointer to the output vector of polynomials b
* - mlk_poly *v: pointer to the output polynomial v
* - const uint8_t *c: pointer to the input serialized ciphertext
*
* Specification:
* Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L1-4]
*
**************************************************/
static void mlk_unpack_ciphertext(mlk_polyvec b, mlk_poly *v,
                                  const uint8_t c[MLKEM_INDCPA_BYTES])
{
  /* The compressed polynomial v starts right after the compressed
   * vector b. */
  const uint8_t *const v_src = c + MLKEM_POLYVECCOMPRESSEDBYTES_DU;

  mlk_polyvec_decompress_du(b, c);
  mlk_poly_decompress_dv(v, v_src);
}
#if !defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
/* This namespacing is not done at the top to avoid a naming conflict
* with native backends, which are currently not yet namespaced. */
#define mlk_poly_permute_bitrev_to_custom \
MLK_ADD_PARAM_SET(mlk_poly_permute_bitrev_to_custom)
/* Fallback used when MLK_USE_NATIVE_NTT_CUSTOM_ORDER is not defined:
 * the body leaves `data` untouched and only marks the parameter as used. */
static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of mlk_gen_matrix. */
requires(memory_no_alias(data, sizeof(int16_t) * MLKEM_N))
requires(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))
assigns(memory_slice(data, sizeof(mlk_poly)))
ensures(array_bound(data, 0, MLKEM_N, 0, MLKEM_Q))) { ((void)data); }
#endif /* !MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
/* Reference: `gen_matrix()` in the reference implementation @[REF].
* - We use a special subroutine to generate 4 polynomials
* at a time, to be able to leverage batched Keccak-f1600
* implementations. The reference implementation generates
* one matrix entry at a time.
*
* Not static for benchmarking */
MLK_INTERNAL_API
void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
{
unsigned i, j;
/*
* We generate four separate seed arrays rather than a single one to work
* around limitations in CBMC function contracts dealing with disjoint slices
* of the same parent object.
*/
MLK_ALIGN uint8_t seed_ext[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)];
/* Every extended seed starts with the same MLKEM_SYMBYTES seed bytes; the
 * two trailing index bytes are filled in per matrix entry below. */
for (j = 0; j < 4; j++)
{
mlk_memcpy(seed_ext[j], seed, MLKEM_SYMBYTES);
}
/* Sample 4 matrix entries at a time. */
for (i = 0; i < (MLKEM_K * MLKEM_K / 4) * 4; i += 4)
{
for (j = 0; j < 4; j++)
{
/* Position of the (i + j)-th entry of the flattened MLKEM_K x MLKEM_K
 * matrix: quotient and remainder of the index by MLKEM_K. */
uint8_t x, y;
x = (i + j) / MLKEM_K;
y = (i + j) % MLKEM_K;
/* The two index bytes are appended in swapped order depending on
 * whether the transposed matrix is requested. */
if (transposed)
{
seed_ext[j][MLKEM_SYMBYTES + 0] = x;
seed_ext[j][MLKEM_SYMBYTES + 1] = y;
}
else
{
seed_ext[j][MLKEM_SYMBYTES + 0] = y;
seed_ext[j][MLKEM_SYMBYTES + 1] = x;
}
}
mlk_poly_rej_uniform_x4(&a[i], &a[i + 1], &a[i + 2], &a[i + 3], seed_ext);
}
/* For MLKEM_K == 3, sample the last entry individually. */
if (i < MLKEM_K * MLKEM_K)
{
uint8_t x, y;
x = i / MLKEM_K;
y = i % MLKEM_K;
if (transposed)
{
seed_ext[0][MLKEM_SYMBYTES + 0] = x;
seed_ext[0][MLKEM_SYMBYTES + 1] = y;
}
else
{
seed_ext[0][MLKEM_SYMBYTES + 0] = y;
seed_ext[0][MLKEM_SYMBYTES + 1] = x;
}
mlk_poly_rej_uniform(&a[i], seed_ext[0]);
i++;
}
/* Debug check: all MLKEM_K * MLKEM_K entries have been sampled. */
mlk_assert(i == MLKEM_K * MLKEM_K);
/*
* The public matrix is generated in NTT domain. If the native backend
* uses a custom order in NTT domain, permute A accordingly.
*/
for (i = 0; i < MLKEM_K * MLKEM_K; i++)
{
mlk_poly_permute_bitrev_to_custom(a[i].coeffs);
}
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(seed_ext, sizeof(seed_ext));
}
/*************************************************
* Name: mlk_matvec_mul
*
* Description: Computes matrix-vector product in NTT domain,
* via Montgomery multiplication.
*
* Arguments: - mlk_polyvec out: Pointer to output polynomial vector
* - mlk_polymat a: Input matrix. Must be in NTT domain
* and have coefficients of absolute value < 4096.
* - mlk_polyvec v: Input polynomial vector. Must be in NTT
* domain.
* - mlk_polyvec vc: Mulcache for v, computed via
* mlk_polyvec_mulcache_compute().
*
* Specification: Implements @[FIPS203, Section 2.4.7, Eq (2.12), (2.13)]
*
**************************************************/
static void mlk_matvec_mul(mlk_polyvec out, const mlk_polymat a,
const mlk_polyvec v, const mlk_polyvec_mulcache vc)
__contract__(
requires(memory_no_alias(out, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(v, sizeof(mlk_polyvec)))
requires(memory_no_alias(vc, sizeof(mlk_polyvec_mulcache)))
requires(forall(k0, 0, MLKEM_K * MLKEM_K,
array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
assigns(object_whole(out)))
{
unsigned i;
/* One output polynomial per matrix row. */
for (i = 0; i < MLKEM_K; i++)
__loop__(
assigns(i, object_whole(out))
invariant(i <= MLKEM_K))
{
/* The matrix is stored as a flat array of MLKEM_K * MLKEM_K polynomials;
 * row i starts at index MLKEM_K * i. */
mlk_polyvec_basemul_acc_montgomery_cached(&out[i], &a[MLKEM_K * i], v, vc);
}
}
/* Reference: `indcpa_keypair_derand()` in the reference implementation @[REF].
* - We use x4-batched versions of `poly_getnoise` to leverage
* x4-batched Keccak-f1600.
* - We use a different implementation of `gen_matrix()` which
* uses x4-batched Keccak-f1600 (see `mlk_gen_matrix()` above).
* - We use a mulcache to speed up matrix-vector multiplication.
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
const uint8_t coins[MLKEM_SYMBYTES])
{
/* buf receives the output of mlk_hash_g(): its first MLKEM_SYMBYTES bytes
 * are used as the public seed, the second MLKEM_SYMBYTES bytes as the
 * noise seed. */
MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
const uint8_t *publicseed = buf;
const uint8_t *noiseseed = buf + MLKEM_SYMBYTES;
mlk_polymat a;
mlk_polyvec e, pkpv, skpv;
mlk_polyvec_mulcache skpv_cache;
MLK_ALIGN uint8_t coins_with_domain_separator[MLKEM_SYMBYTES + 1];
/* Concatenate coins with MLKEM_K for domain separation of security levels */
mlk_memcpy(coins_with_domain_separator, coins, MLKEM_SYMBYTES);
coins_with_domain_separator[MLKEM_SYMBYTES] = MLKEM_K;
mlk_hash_g(buf, coins_with_domain_separator, MLKEM_SYMBYTES + 1);
/*
* Declassify the public seed.
* Required to use it in conditional-branches in rejection sampling.
* This is needed because all output of randombytes is marked as secret
* (=undefined)
*/
MLK_CT_TESTING_DECLASSIFY(publicseed, MLKEM_SYMBYTES);
mlk_gen_matrix(a, publicseed, 0 /* no transpose */);
/* Sample the secret vector skpv and the error vector e from the noise
 * seed, four polynomials per call, each with its own nonce. */
#if MLKEM_K == 2
mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &e[0], &e[1], noiseseed, 0, 1,
2, 3);
#elif MLKEM_K == 3
/*
* Only the first three output buffers are needed.
* The last parameter is a dummy that's overwritten later.
*/
mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2],
&pkpv[0] /* irrelevant */, noiseseed, 0, 1, 2,
0xFF /* irrelevant */);
/* Same here */
mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &pkpv[0] /* irrelevant */,
noiseseed, 3, 4, 5, 0xFF /* irrelevant */);
#elif MLKEM_K == 4
mlk_poly_getnoise_eta1_4x(&skpv[0], &skpv[1], &skpv[2], &skpv[3], noiseseed,
0, 1, 2, 3);
mlk_poly_getnoise_eta1_4x(&e[0], &e[1], &e[2], &e[3], noiseseed, 4, 5, 6, 7);
#endif
mlk_polyvec_ntt(skpv);
mlk_polyvec_ntt(e);
/* pkpv = a * skpv (matrix-vector product using the mulcache of skpv),
 * followed by the addition of the error vector e. */
mlk_polyvec_mulcache_compute(skpv_cache, skpv);
mlk_matvec_mul(pkpv, a, skpv, skpv_cache);
mlk_polyvec_tomont(pkpv);
mlk_polyvec_add(pkpv, e);
/* Reduce both vectors before packing; mlk_pack_sk() / mlk_pack_pk()
 * debug-assert coefficients in [0, MLKEM_Q). */
mlk_polyvec_reduce(pkpv);
mlk_polyvec_reduce(skpv);
mlk_pack_sk(sk, skpv);
mlk_pack_pk(pk, pkpv, publicseed);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
mlk_zeroize(coins_with_domain_separator, sizeof(coins_with_domain_separator));
mlk_zeroize(a, sizeof(a));
mlk_zeroize(&e, sizeof(e));
mlk_zeroize(&skpv, sizeof(skpv));
mlk_zeroize(&skpv_cache, sizeof(skpv_cache));
}
/* Reference: `indcpa_enc()` in the reference implementation @[REF].
* - We use x4-batched versions of `poly_getnoise` to leverage
* x4-batched Keccak-f1600.
* - We use a different implementation of `gen_matrix()` which
* uses x4-batched Keccak-f1600 (see `mlk_gen_matrix()` above).
* - We use a mulcache to speed up matrix-vector multiplication.
* - We include buffer zeroization.
*/
MLK_INTERNAL_API
void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
const uint8_t m[MLKEM_INDCPA_MSGBYTES],
const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
const uint8_t coins[MLKEM_SYMBYTES])
{
MLK_ALIGN uint8_t seed[MLKEM_SYMBYTES];
mlk_polymat at;
mlk_polyvec sp, pkpv, ep, b;
mlk_poly v, k, epp;
mlk_polyvec_mulcache sp_cache;
/* Split the packed public key into the polynomial vector pkpv and the
 * matrix seed, and convert the message m into the polynomial k. */
mlk_unpack_pk(pkpv, seed, pk);
mlk_poly_frommsg(&k, m);
/*
* Declassify the public seed.
* Required to use it in conditional-branches in rejection sampling.
* This is needed because in re-encryption the publicseed originated from sk
* which is marked undefined.
*/
MLK_CT_TESTING_DECLASSIFY(seed, MLKEM_SYMBYTES);
mlk_gen_matrix(at, seed, 1 /* transpose */);
/* Sample the noise polynomials sp, ep and epp from coins; each sampled
 * polynomial is given its own nonce. */
#if MLKEM_K == 2
mlk_poly_getnoise_eta1122_4x(&sp[0], &sp[1], &ep[0], &ep[1], coins, 0, 1, 2,
3);
mlk_poly_getnoise_eta2(&epp, coins, 4);
#elif MLKEM_K == 3
/*
* In this call, only the first three output buffers are needed.
* The last parameter is a dummy that's overwritten later.
*/
mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &b[0], coins, 0, 1, 2,
0xFF);
/* The fourth output buffer in this call _is_ used. */
mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &epp, coins, 3, 4, 5, 6);
#elif MLKEM_K == 4
mlk_poly_getnoise_eta1_4x(&sp[0], &sp[1], &sp[2], &sp[3], coins, 0, 1, 2, 3);
mlk_poly_getnoise_eta2_4x(&ep[0], &ep[1], &ep[2], &ep[3], coins, 4, 5, 6, 7);
mlk_poly_getnoise_eta2(&epp, coins, 8);
#endif
mlk_polyvec_ntt(sp);
/* b = at * sp and v = <pkpv, sp>; both products reuse the mulcache
 * computed once for sp. */
mlk_polyvec_mulcache_compute(sp_cache, sp);
mlk_matvec_mul(b, at, sp, sp_cache);
mlk_polyvec_basemul_acc_montgomery_cached(&v, pkpv, sp, sp_cache);
mlk_polyvec_invntt_tomont(b);
mlk_poly_invntt_tomont(&v);
/* Add the noise terms ep / epp and the message polynomial k. */
mlk_polyvec_add(b, ep);
mlk_poly_add(&v, &epp);
mlk_poly_add(&v, &k);
mlk_polyvec_reduce(b);
mlk_poly_reduce(&v);
mlk_pack_ciphertext(c, b, &v);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(seed, sizeof(seed));
mlk_zeroize(&sp, sizeof(sp));
mlk_zeroize(&sp_cache, sizeof(sp_cache));
mlk_zeroize(&b, sizeof(b));
mlk_zeroize(&v, sizeof(v));
mlk_zeroize(at, sizeof(at));
mlk_zeroize(&k, sizeof(k));
mlk_zeroize(&ep, sizeof(ep));
mlk_zeroize(&epp, sizeof(epp));
}
/* Reference: `indcpa_dec()` in the reference implementation @[REF].
* - We use a mulcache for the scalar product.
* - We include buffer zeroization. */
MLK_INTERNAL_API
void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
const uint8_t c[MLKEM_INDCPA_BYTES],
const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
{
mlk_polyvec b, skpv;
mlk_poly v, sb;
mlk_polyvec_mulcache b_cache;
/* Recover (b, v) from the ciphertext and skpv from the packed secret key. */
mlk_unpack_ciphertext(b, &v, c);
mlk_unpack_sk(skpv, sk);
/* sb = <skpv, b>, using a mulcache computed for b. */
mlk_polyvec_ntt(b);
mlk_polyvec_mulcache_compute(b_cache, b);
mlk_polyvec_basemul_acc_montgomery_cached(&sb, skpv, b, b_cache);
mlk_poly_invntt_tomont(&sb);
/* Combine v with sb via mlk_poly_sub(), reduce, and encode the result as
 * the output message. */
mlk_poly_sub(&v, &sb);
mlk_poly_reduce(&v);
mlk_poly_tomsg(m, &v);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(&skpv, sizeof(skpv));
mlk_zeroize(&b, sizeof(b));
mlk_zeroize(&b_cache, sizeof(b_cache));
mlk_zeroize(&v, sizeof(v));
mlk_zeroize(&sb, sizeof(sb));
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_pack_pk
#undef mlk_unpack_pk
#undef mlk_pack_sk
#undef mlk_unpack_sk
#undef mlk_pack_ciphertext
#undef mlk_unpack_ciphertext
#undef mlk_matvec_mul
#undef mlk_poly_permute_bitrev_to_custom

View File

@@ -0,0 +1,143 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*/
#ifndef MLK_INDCPA_H
#define MLK_INDCPA_H
#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "poly_k.h"
#define mlk_gen_matrix MLK_NAMESPACE_K(gen_matrix)
/*************************************************
* Name: mlk_gen_matrix
*
* Description: Deterministically generate matrix A (or the transpose of A)
* from a seed. Entries of the matrix are polynomials that look
* uniformly random. Performs rejection sampling on output of
* a XOF
*
* Arguments: - mlk_polymat a: pointer to output matrix A
* - const uint8_t *seed: pointer to input seed
* - int transposed: boolean deciding whether A or A^T is generated
*
* Specification: Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen), L3-7]
* and @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L4-8].
* The `transposed` parameter only affects internal presentation.
*
**************************************************/
MLK_INTERNAL_API
void mlk_gen_matrix(mlk_polymat a, const uint8_t seed[MLKEM_SYMBYTES],
int transposed)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polymat)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
requires(transposed == 0 || transposed == 1)
assigns(object_whole(a))
ensures(forall(x, 0, MLKEM_K * MLKEM_K,
array_bound(a[x].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_indcpa_keypair_derand MLK_NAMESPACE_K(indcpa_keypair_derand)
/*************************************************
* Name: mlk_indcpa_keypair_derand
*
* Description: Generates public and private key for the CPA-secure
* public-key encryption scheme underlying ML-KEM
*
* Arguments: - uint8_t *pk: pointer to output public key
* (of length MLKEM_INDCPA_PUBLICKEYBYTES bytes)
* - uint8_t *sk: pointer to output private key
* (of length MLKEM_INDCPA_SECRETKEYBYTES bytes)
* - const uint8_t *coins: pointer to input randomness
* (of length MLKEM_SYMBYTES bytes)
*
* Specification: Implements @[FIPS203, Algorithm 13 (K-PKE.KeyGen)].
*
**************************************************/
MLK_INTERNAL_API
void mlk_indcpa_keypair_derand(uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES],
const uint8_t coins[MLKEM_SYMBYTES])
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
assigns(object_whole(pk))
assigns(object_whole(sk))
);
#define mlk_indcpa_enc MLK_NAMESPACE_K(indcpa_enc)
/*************************************************
* Name: mlk_indcpa_enc
*
* Description: Encryption function of the CPA-secure
* public-key encryption scheme underlying ML-KEM.
*
* Arguments: - uint8_t *c: pointer to output ciphertext
* (of length MLKEM_INDCPA_BYTES bytes)
* - const uint8_t *m: pointer to input message
* (of length MLKEM_INDCPA_MSGBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (of length MLKEM_INDCPA_PUBLICKEYBYTES)
* - const uint8_t *coins: pointer to input random coins used as
* seed (of length MLKEM_SYMBYTES) to deterministically generate
* all randomness
*
* Specification: Implements @[FIPS203, Algorithm 14 (K-PKE.Encrypt)].
*
**************************************************/
MLK_INTERNAL_API
void mlk_indcpa_enc(uint8_t c[MLKEM_INDCPA_BYTES],
const uint8_t m[MLKEM_INDCPA_MSGBYTES],
const uint8_t pk[MLKEM_INDCPA_PUBLICKEYBYTES],
const uint8_t coins[MLKEM_SYMBYTES])
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(pk, MLKEM_INDCPA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
assigns(object_whole(c))
);
#define mlk_indcpa_dec MLK_NAMESPACE_K(indcpa_dec)
/*************************************************
* Name: mlk_indcpa_dec
*
* Description: Decryption function of the CPA-secure
* public-key encryption scheme underlying ML-KEM.
*
* Arguments: - uint8_t *m: pointer to output decrypted message
* (of length MLKEM_INDCPA_MSGBYTES)
* - const uint8_t *c: pointer to input ciphertext
* (of length MLKEM_INDCPA_BYTES)
* - const uint8_t *sk: pointer to input secret key
* (of length MLKEM_INDCPA_SECRETKEYBYTES)
*
* Specification: Implements @[FIPS203, Algorithm 15 (K-PKE.Decrypt)].
*
**************************************************/
MLK_INTERNAL_API
void mlk_indcpa_dec(uint8_t m[MLKEM_INDCPA_MSGBYTES],
const uint8_t c[MLKEM_INDCPA_BYTES],
const uint8_t sk[MLKEM_INDCPA_SECRETKEYBYTES])
__contract__(
requires(memory_no_alias(c, MLKEM_INDCPA_BYTES))
requires(memory_no_alias(m, MLKEM_INDCPA_MSGBYTES))
requires(memory_no_alias(sk, MLKEM_INDCPA_SECRETKEYBYTES))
assigns(object_whole(m))
);
#endif /* !MLK_INDCPA_H */

View File

@@ -0,0 +1,337 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS140_3_IG]
* Implementation Guidance for FIPS 140-3 and the Cryptographic Module
* Validation Program National Institute of Standards and Technology
* https://csrc.nist.gov/projects/cryptographic-module-validation-program/fips-140-3-ig-announcements
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include "indcpa.h"
#include "kem.h"
#include "randombytes.h"
#include "symmetric.h"
#include "verify.h"
/* Parameter set namespacing
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying security levels)
* within a single compilation unit. */
#define mlk_check_pct MLK_ADD_PARAM_SET(mlk_check_pct)
/* End of parameter set namespacing */
#if defined(CBMC)
/* Redeclaration with contract needed for CBMC only */
int memcmp(const void *str1, const void *str2, size_t n)
__contract__(
requires(memory_no_alias(str1, n))
requires(memory_no_alias(str2, n))
);
#endif /* CBMC */
/* Reference: Not implemented in the reference implementation @[REF]. */
MLK_INTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
{
  int status;
  mlk_polyvec decoded;
  uint8_t canonical[MLKEM_POLYVECBYTES];

  /* Decode the polynomial vector, reduce it, and serialize it again.
   * The key is accepted only if this round trip reproduces the first
   * MLKEM_POLYVECBYTES bytes of `pk` exactly. */
  mlk_polyvec_frombytes(decoded, pk);
  mlk_polyvec_reduce(decoded);
  mlk_polyvec_tobytes(canonical, decoded);

  /* A constant-time comparison is used so that the public key does not
   * have to be declassified before the PCT has succeeded. */
  status = (mlk_ct_memcmp(pk, canonical, MLKEM_POLYVECBYTES) != 0) ? -1 : 0;

  /* Specification: Partially implements
   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
  mlk_zeroize(&decoded, sizeof(decoded));
  mlk_zeroize(canonical, sizeof(canonical));
  return status;
}
/* Reference: Not implemented in the reference implementation @[REF]. */
MLK_INTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
{
int res;
MLK_ALIGN uint8_t test[MLKEM_SYMBYTES];
/*
* The parts of `sk` being hashed and compared here are public, so
* no public information is leaked through the runtime or the return value
* of this function.
*/
/* Declassify the public part of the secret key */
MLK_CT_TESTING_DECLASSIFY(sk + MLKEM_INDCPA_SECRETKEYBYTES,
MLKEM_INDCCA_PUBLICKEYBYTES);
MLK_CT_TESTING_DECLASSIFY(
sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, MLKEM_SYMBYTES);
mlk_hash_h(test, sk + MLKEM_INDCPA_SECRETKEYBYTES,
MLKEM_INDCCA_PUBLICKEYBYTES);
res = memcmp(sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES, test,
MLKEM_SYMBYTES)
? -1
: 0;
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(test, sizeof(test));
return res;
}
/* Forward declaration of the key generation pairwise consistency test
* (PCT). It carries the contract shared by the two alternative
* definitions that follow, one of which is selected by
* MLK_CONFIG_KEYGEN_PCT. A return value of 0 means the test passed or
* was skipped; any other value is treated as failure by the caller. */
MLK_MUST_CHECK_RETURN_VALUE
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES)));
#if defined(MLK_CONFIG_KEYGEN_PCT)
/* Specification:
* Partially implements 'Pairwise Consistency Test' @[FIPS140_3_IG, p.87] and
* @[FIPS203, Section 7.1, Pairwise Consistency]. */
/* Reference: Not implemented in the reference implementation @[REF]. */
/* Runs one encapsulation against `pk` followed by one decapsulation with
* `sk`, and compares the two resulting shared secrets.
*
* Returns 0 if both operations succeed and the shared secrets agree.
* Otherwise returns the non-zero result of the failing step; for the
* comparison this is whatever mlk_ct_memcmp reports (presumably non-zero
* on mismatch, as at its other call sites in this file). */
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
{
int res;
uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES];
uint8_t ss_enc[MLKEM_SSBYTES], ss_dec[MLKEM_SSBYTES];
/* Encapsulate against the public key under test. */
res = crypto_kem_enc(ct, ss_enc, pk);
if (res != 0)
{
goto cleanup;
}
/* Decapsulate the fresh ciphertext with the secret key under test. */
res = crypto_kem_dec(ss_dec, ct, sk);
if (res != 0)
{
goto cleanup;
}
#if defined(MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST)
/* Deliberately break PCT for testing purposes */
if (mlk_break_pct())
{
/* Flip every bit of the first byte so the comparison below must fail. */
ss_enc[0] = ~ss_enc[0];
}
#endif /* MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST */
/* Constant-time comparison: the shared secrets are secret data. */
res = mlk_ct_memcmp(ss_enc, ss_dec, sizeof(ss_dec));
cleanup:
/* The result of the PCT is public. */
MLK_CT_TESTING_DECLASSIFY(&res, sizeof(res));
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
/* Reached on every path, including early failures, so the buffers are
* always wiped before returning. */
mlk_zeroize(ct, sizeof(ct));
mlk_zeroize(ss_enc, sizeof(ss_enc));
mlk_zeroize(ss_dec, sizeof(ss_dec));
return res;
}
#else /* MLK_CONFIG_KEYGEN_PCT */
static int mlk_check_pct(uint8_t const pk[MLKEM_INDCCA_PUBLICKEYBYTES],
                         uint8_t const sk[MLKEM_INDCCA_SECRETKEYBYTES])
{
  /* The PCT is disabled in this build: always report success.
   * The casts only silence unused-parameter warnings. */
  (void)sk;
  (void)pk;
  return 0;
}
#endif /* !MLK_CONFIG_KEYGEN_PCT */
/* Reference: `crypto_kem_keypair_derand()` in the reference implementation
 * @[REF].
 * - A PCT, absent from the reference code, is optionally run at the end. */
MLK_EXTERNAL_API
int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
                              uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
                              const uint8_t coins[2 * MLKEM_SYMBYTES])
{
  /* Destinations inside `sk` that are filled in after the IND-CPA key:
   * a copy of the public key, its hash, and the rejection value z. */
  uint8_t *const sk_pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
  uint8_t *const sk_pk_hash =
      sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES;
  uint8_t *const sk_z = sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES;
  /* The second half of `coins` supplies z. */
  const uint8_t *const z = coins + MLKEM_SYMBYTES;

  mlk_indcpa_keypair_derand(pk, sk, coins);

  mlk_memcpy(sk_pk, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
  mlk_hash_h(sk_pk_hash, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
  /* Value z for pseudo-random output on reject */
  mlk_memcpy(sk_z, z, MLKEM_SYMBYTES);

  /* Declassify public key */
  MLK_CT_TESTING_DECLASSIFY(pk, MLKEM_INDCCA_PUBLICKEYBYTES);

  /* Pairwise Consistency Test (PCT) @[FIPS140_3_IG, p.87]:
   * any non-zero PCT result is reported to the caller as -1. */
  return mlk_check_pct(pk, sk) ? -1 : 0;
}
/* Reference: `crypto_kem_keypair()` in the reference implementation @[REF]
* - We zeroize the stack buffer */
MLK_EXTERNAL_API
int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
{
int res;
MLK_ALIGN uint8_t coins[2 * MLKEM_SYMBYTES];
/* Acquire necessary randomness, and mark it as secret. */
mlk_randombytes(coins, 2 * MLKEM_SYMBYTES);
MLK_CT_TESTING_SECRET(coins, sizeof(coins));
res = crypto_kem_keypair_derand(pk, sk, coins);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(coins, sizeof(coins));
return res;
}
/* Reference: `crypto_kem_enc_derand()` in the reference implementation @[REF]
* - We include public key check
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
uint8_t ss[MLKEM_SSBYTES],
const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
const uint8_t coins[MLKEM_SYMBYTES])
{
MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
/* Will contain key, coins */
MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
/* Specification: Implements @[FIPS203, Section 7.2, Modulus check] */
if (crypto_kem_check_pk(pk))
{
return -1;
}
mlk_memcpy(buf, coins, MLKEM_SYMBYTES);
/* Multitarget countermeasure for coins + contributory KEM */
mlk_hash_h(buf + MLKEM_SYMBYTES, pk, MLKEM_INDCCA_PUBLICKEYBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* coins are in kr+MLKEM_SYMBYTES */
mlk_indcpa_enc(ct, buf, pk, kr + MLKEM_SYMBYTES);
mlk_memcpy(ss, kr, MLKEM_SYMBYTES);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
mlk_zeroize(kr, sizeof(kr));
return 0;
}
/* Reference: `crypto_kem_enc()` in the reference implementation @[REF]
* - We include stack buffer zeroization */
MLK_EXTERNAL_API
int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
uint8_t ss[MLKEM_SSBYTES],
const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
{
int res;
MLK_ALIGN uint8_t coins[MLKEM_SYMBYTES];
mlk_randombytes(coins, MLKEM_SYMBYTES);
MLK_CT_TESTING_SECRET(coins, sizeof(coins));
res = crypto_kem_enc_derand(ct, ss, pk, coins);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(coins, sizeof(coins));
return res;
}
/* Reference: `crypto_kem_dec()` in the reference implementation @[REF]
* - We include secret key check
* - We include stack buffer zeroization */
/* Decapsulation with implicit rejection: the ciphertext is decrypted,
* re-encrypted, and compared against the input in constant time. Both the
* candidate key and the rejection key are always computed; the candidate
* key is selected only when the comparison result `fail` is 0.
*
* Returns -1 if the hash check of `sk` fails (in which case `ss` is not
* written), and 0 otherwise. A mismatching ciphertext does not produce an
* error return. */
MLK_EXTERNAL_API
int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
{
/* Result of the constant-time ciphertext comparison. */
uint8_t fail;
/* Will contain the decrypted message followed by the stored H(pk) */
MLK_ALIGN uint8_t buf[2 * MLKEM_SYMBYTES];
/* Will contain key, coins */
MLK_ALIGN uint8_t kr[2 * MLKEM_SYMBYTES];
/* First holds the re-encrypted ciphertext, then is reused as the
* input (z followed by ct) to the rejection-key hash. */
MLK_ALIGN uint8_t tmp[MLKEM_SYMBYTES + MLKEM_INDCCA_CIPHERTEXTBYTES];
/* Public key embedded in the secret key. */
const uint8_t *pk = sk + MLKEM_INDCPA_SECRETKEYBYTES;
/* Specification: Implements @[FIPS203, Section 7.3, Hash check] */
if (crypto_kem_check_sk(sk))
{
return -1;
}
/* Decrypt the ciphertext into the first half of buf. */
mlk_indcpa_dec(buf, ct, sk);
/* Multitarget countermeasure for coins + contributory KEM */
mlk_memcpy(buf + MLKEM_SYMBYTES,
sk + MLKEM_INDCCA_SECRETKEYBYTES - 2 * MLKEM_SYMBYTES,
MLKEM_SYMBYTES);
mlk_hash_g(kr, buf, 2 * MLKEM_SYMBYTES);
/* Recompute and compare ciphertext */
/* coins are in kr+MLKEM_SYMBYTES */
mlk_indcpa_enc(tmp, buf, pk, kr + MLKEM_SYMBYTES);
fail = mlk_ct_memcmp(ct, tmp, MLKEM_INDCCA_CIPHERTEXTBYTES);
/* Compute rejection key */
/* This overwrites the re-encrypted ciphertext in tmp, which is no longer
* needed once `fail` has been computed. */
mlk_memcpy(tmp, sk + MLKEM_INDCCA_SECRETKEYBYTES - MLKEM_SYMBYTES,
MLKEM_SYMBYTES);
mlk_memcpy(tmp + MLKEM_SYMBYTES, ct, MLKEM_INDCCA_CIPHERTEXTBYTES);
mlk_hash_j(ss, tmp, sizeof(tmp));
/* Copy true key to return buffer if fail is 0 */
mlk_ct_cmov_zero(ss, kr, MLKEM_SYMBYTES, fail);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
mlk_zeroize(kr, sizeof(kr));
mlk_zeroize(tmp, sizeof(tmp));
return 0;
}
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_check_pct

View File

@@ -0,0 +1,282 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
#ifndef MLK_KEM_H
#define MLK_KEM_H
#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "sys.h"
#if defined(MLK_CHECK_APIS)
/* Include to ensure consistency between internal kem.h
* and external mlkem_native.h. */
#define MLK_CONFIG_API_NO_SUPERCOP
#include "mlkem_native.h"
#undef MLK_CONFIG_API_NO_SUPERCOP
#if MLKEM_INDCCA_SECRETKEYBYTES != \
MLKEM_SECRETKEYBYTES(MLK_CONFIG_PARAMETER_SET)
#error Mismatch for SECRETKEYBYTES between kem.h and mlkem_native.h
#endif
#if MLKEM_INDCCA_PUBLICKEYBYTES != \
MLKEM_PUBLICKEYBYTES(MLK_CONFIG_PARAMETER_SET)
#error Mismatch for PUBLICKEYBYTES between kem.h and mlkem_native.h
#endif
#if MLKEM_INDCCA_CIPHERTEXTBYTES != \
MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_PARAMETER_SET)
#error Mismatch for CIPHERTEXTBYTES between kem.h and mlkem_native.h
#endif
#endif /* MLK_CHECK_APIS */
#define crypto_kem_keypair_derand MLK_NAMESPACE_K(keypair_derand)
#define crypto_kem_keypair MLK_NAMESPACE_K(keypair)
#define crypto_kem_enc_derand MLK_NAMESPACE_K(enc_derand)
#define crypto_kem_enc MLK_NAMESPACE_K(enc)
#define crypto_kem_dec MLK_NAMESPACE_K(dec)
#define crypto_kem_check_pk MLK_NAMESPACE_K(check_pk)
#define crypto_kem_check_sk MLK_NAMESPACE_K(check_sk)
/*************************************************
* Name: crypto_kem_check_pk
*
* Description: Implements modulus check mandated by FIPS 203,
* i.e., ensures that coefficients are in [0,q-1].
*
* Arguments: - const uint8_t *pk: pointer to input public key
* (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
* bytes)
*
* Returns: - 0 on success
* - -1 on failure
*
* Specification: Implements @[FIPS203, Section 7.2, 'modulus check']
*
**************************************************/
/* Reference: Not implemented in the reference implementation @[REF]. */
MLK_INTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_check_pk(const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES]);
/*************************************************
* Name: crypto_kem_check_sk
*
* Description: Implements public key hash check mandated by FIPS 203,
* i.e., ensures that
* sk[768𝑘+32 : 768𝑘+64] = H(pk) = H(sk[384𝑘 : 768𝑘+32])
*
* Arguments: - const uint8_t *sk: pointer to input private key
* (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
* bytes)
*
* Returns: - 0 on success
* - -1 on failure
*
* Specification: Implements @[FIPS203, Section 7.3, 'hash check']
*
**************************************************/
/* Reference: Not implemented in the reference implementation @[REF]. */
MLK_INTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_check_sk(const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES]);
/*************************************************
* Name: crypto_kem_keypair_derand
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
*
* Arguments: - uint8_t *pk: pointer to output public key
* (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
* bytes)
* - uint8_t *sk: pointer to output private key
* (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
* bytes)
* - const uint8_t *coins: pointer to input randomness
* (an already allocated array filled with 2*MLKEM_SYMBYTES
* random bytes)
*
* Returns: - 0: On success
* - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT is enabled).
*
* Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_keypair_derand(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES],
const uint8_t coins[2 * MLKEM_SYMBYTES])
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
requires(memory_no_alias(coins, 2 * MLKEM_SYMBYTES))
assigns(object_whole(pk))
assigns(object_whole(sk))
);
/*************************************************
* Name: crypto_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
*
* Arguments: - uint8_t *pk: pointer to output public key
* (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
* bytes)
* - uint8_t *sk: pointer to output private key
* (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
* bytes)
*
* Returns: - 0: On success
* - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT is enabled).
*
* Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_keypair(uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
__contract__(
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
assigns(object_whole(pk))
assigns(object_whole(sk))
);
/*************************************************
* Name: crypto_kem_enc_derand
*
* Description: Generates cipher text and shared
* secret for given public key
*
* Arguments: - uint8_t *ct: pointer to output cipher text
* (an already allocated array of MLKEM_INDCCA_CIPHERTEXTBYTES
* bytes)
* - uint8_t *ss: pointer to output shared secret
* (an already allocated array of MLKEM_SSBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
* bytes)
* - const uint8_t *coins: pointer to input randomness
* (an already allocated array filled with MLKEM_SYMBYTES random
* bytes)
*
* Returns: - 0 on success
* - -1 if the 'modulus check' @[FIPS203, Section 7.2]
* for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_enc_derand(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
uint8_t ss[MLKEM_SSBYTES],
const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES],
const uint8_t coins[MLKEM_SYMBYTES])
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
requires(memory_no_alias(coins, MLKEM_SYMBYTES))
assigns(object_whole(ct))
assigns(object_whole(ss))
);
/*************************************************
* Name: crypto_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
*
* Arguments: - uint8_t *ct: pointer to output cipher text
* (an already allocated array of MLKEM_INDCCA_CIPHERTEXTBYTES
* bytes)
* - uint8_t *ss: pointer to output shared secret
* (an already allocated array of MLKEM_SSBYTES bytes)
* - const uint8_t *pk: pointer to input public key
* (an already allocated array of MLKEM_INDCCA_PUBLICKEYBYTES
* bytes)
*
* Returns: - 0 on success
* - -1 if the 'modulus check' @[FIPS203, Section 7.2]
* for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_enc(uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
uint8_t ss[MLKEM_SSBYTES],
const uint8_t pk[MLKEM_INDCCA_PUBLICKEYBYTES])
__contract__(
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(pk, MLKEM_INDCCA_PUBLICKEYBYTES))
assigns(object_whole(ct))
assigns(object_whole(ss))
);
/*************************************************
* Name: crypto_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
*
* Arguments: - uint8_t *ss: pointer to output shared secret
* (an already allocated array of MLKEM_SSBYTES bytes)
* - const uint8_t *ct: pointer to input cipher text
* (an already allocated array of MLKEM_INDCCA_CIPHERTEXTBYTES
* bytes)
* - const uint8_t *sk: pointer to input private key
* (an already allocated array of MLKEM_INDCCA_SECRETKEYBYTES
* bytes)
*
* Returns: - 0 on success
* - -1 if the 'hash check' @[FIPS203, Section 7.3]
* for the secret key fails.
*
* Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
*
**************************************************/
MLK_EXTERNAL_API
MLK_MUST_CHECK_RETURN_VALUE
int crypto_kem_dec(uint8_t ss[MLKEM_SSBYTES],
const uint8_t ct[MLKEM_INDCCA_CIPHERTEXTBYTES],
const uint8_t sk[MLKEM_INDCCA_SECRETKEYBYTES])
__contract__(
requires(memory_no_alias(ss, MLKEM_SSBYTES))
requires(memory_no_alias(ct, MLKEM_INDCCA_CIPHERTEXTBYTES))
requires(memory_no_alias(sk, MLKEM_INDCCA_SECRETKEYBYTES))
assigns(object_whole(ss))
);
#endif /* !MLK_KEM_H */

View File

@@ -0,0 +1,291 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*/
#ifndef MLK_H
#define MLK_H
/******************************************************************************
*
* Public API for mlkem-native
*
* This header defines the public API of a single build of mlkem-native.
*
* # Examples
*
* See [examples/basic], [examples/multilevel_build], and
* [examples/multilevel_build_native] for examples of how to use this header.
*
* # Usage
*
* To use this header, configure the following options:
*
* - MLK_CONFIG_API_PARAMETER_SET [required]
*
* The parameter set used for the build; 512, 768, or 1024.
*
* - MLK_CONFIG_API_NAMESPACE_PREFIX [required]
*
* The namespace prefix used for the build.
*
* NOTE:
* For a multi-level build, you must include the 512/768/1024 suffixes
* in MLK_CONFIG_API_NAMESPACE_PREFIX.
*
* - MLK_CONFIG_API_NO_SUPERCOP [optional]
*
* By default, this header will also expose the mlkem-native API in the
* SUPERCOP naming convention crypto_kem_xxx. If you don't want/need this,
* set MLK_CONFIG_API_NO_SUPERCOP. You must set this for a multi-level build.
*
* - MLK_CONFIG_API_CONSTANTS_ONLY [optional]
*
* If you don't want this header to expose any function declarations,
* but only constants for the sizes of key material, set
* MLK_CONFIG_API_CONSTANTS_ONLY. In this case, you don't need to set
* MLK_CONFIG_API_PARAMETER_SET or MLK_CONFIG_API_NAMESPACE_PREFIX,
* nor include a configuration.
*
* # Multi-level builds
*
* This header specifies a build of mlkem-native for a fixed security level.
* If you need multiple builds, e.g. to build a library offering multiple
* security levels, you need multiple instances of this header.
*
* NOTE: In this case, you must rename or #undef the MLK_H header guard
* prior to subsequent inclusions of this file.
*
******************************************************************************/
/******************************* Key sizes ************************************/
/* Sizes of cryptographic material, per parameter set */
/* See mlkem/common.h for the arithmetic expressions giving rise to these */
/* check-magic: off */
#define MLKEM512_SECRETKEYBYTES 1632
#define MLKEM512_PUBLICKEYBYTES 800
#define MLKEM512_CIPHERTEXTBYTES 768
#define MLKEM768_SECRETKEYBYTES 2400
#define MLKEM768_PUBLICKEYBYTES 1184
#define MLKEM768_CIPHERTEXTBYTES 1088
#define MLKEM1024_SECRETKEYBYTES 3168
#define MLKEM1024_PUBLICKEYBYTES 1568
#define MLKEM1024_CIPHERTEXTBYTES 1568
/* check-magic: on */
/* Size of randomness coins in bytes (level-independent) */
#define MLKEM_SYMBYTES 32
#define MLKEM512_SYMBYTES MLKEM_SYMBYTES
#define MLKEM768_SYMBYTES MLKEM_SYMBYTES
#define MLKEM1024_SYMBYTES MLKEM_SYMBYTES
/* Size of shared secret in bytes (level-independent) */
#define MLKEM_BYTES 32
#define MLKEM512_BYTES MLKEM_BYTES
#define MLKEM768_BYTES MLKEM_BYTES
#define MLKEM1024_BYTES MLKEM_BYTES
/* Sizes of cryptographic material, as a function of LVL=512,768,1024 */
#define MLKEM_SECRETKEYBYTES_(LVL) MLKEM##LVL##_SECRETKEYBYTES
#define MLKEM_PUBLICKEYBYTES_(LVL) MLKEM##LVL##_PUBLICKEYBYTES
#define MLKEM_CIPHERTEXTBYTES_(LVL) MLKEM##LVL##_CIPHERTEXTBYTES
#define MLKEM_SECRETKEYBYTES(LVL) MLKEM_SECRETKEYBYTES_(LVL)
#define MLKEM_PUBLICKEYBYTES(LVL) MLKEM_PUBLICKEYBYTES_(LVL)
#define MLKEM_CIPHERTEXTBYTES(LVL) MLKEM_CIPHERTEXTBYTES_(LVL)
/****************************** Function API **********************************/
#if !defined(MLK_CONFIG_API_CONSTANTS_ONLY)
#if !defined(MLK_CONFIG_API_PARAMETER_SET)
#error MLK_CONFIG_API_PARAMETER_SET not defined
#endif
#if !defined(MLK_CONFIG_API_NAMESPACE_PREFIX)
#error MLK_CONFIG_API_NAMESPACE_PREFIX not defined
#endif
/* Derive namespacing macro */
#define MLK_API_CONCAT_(x, y) x##y
#define MLK_API_CONCAT(x, y) MLK_API_CONCAT_(x, y)
#define MLK_API_CONCAT_UNDERSCORE(x, y) MLK_API_CONCAT(MLK_API_CONCAT(x, _), y)
#define MLK_API_NAMESPACE(sym) \
MLK_API_CONCAT_UNDERSCORE(MLK_CONFIG_API_NAMESPACE_PREFIX, sym)
/* MLK_API_MUST_CHECK_RETURN_VALUE asks the compiler to warn when the return
 * value of an API function is ignored. It expands to nothing on compilers
 * that are not known to support the attribute.
 *
 * Clang identifies itself through `__clang__`; the bare identifier `clang`
 * is not a predefined macro. Testing `__clang__` keeps the attribute enabled
 * for Clang front ends that do not also define `__GNUC__`. */
#if defined(__GNUC__) || defined(__clang__)
#define MLK_API_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
#else
#define MLK_API_MUST_CHECK_RETURN_VALUE
#endif
#include <stdint.h>
/*************************************************
* Name: crypto_kem_keypair_derand
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
*
* Arguments: - uint8_t pk[]: pointer to output public key, an array of
* length MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
* - uint8_t sk[]: pointer to output private key, an array of
* MLKEM{512,768,1024}_SECRETKEYBYTES bytes.
* - const uint8_t *coins: pointer to input randomness, an array of
* 2*MLKEM_SYMBYTES uniformly random bytes.
*
* Returns: - 0: On success
* - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT is enabled).
*
* Specification: Implements @[FIPS203, Algorithm 16, ML-KEM.KeyGen_Internal]
*
**************************************************/
MLK_API_MUST_CHECK_RETURN_VALUE
int MLK_API_NAMESPACE(keypair_derand)(
uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)],
uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)],
const uint8_t coins[2 * MLKEM_SYMBYTES]);
/*************************************************
* Name: crypto_kem_keypair
*
* Description: Generates public and private key
* for CCA-secure ML-KEM key encapsulation mechanism
*
* Arguments: - uint8_t *pk: pointer to output public key, an array of
* MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
* - uint8_t *sk: pointer to output private key, an array of
* MLKEM{512,768,1024}_SECRETKEYBYTES bytes.
*
* Returns: - 0: On success
* - -1: On PCT failure (if MLK_CONFIG_KEYGEN_PCT is enabled).
*
* Specification: Implements @[FIPS203, Algorithm 19, ML-KEM.KeyGen]
*
**************************************************/
MLK_API_MUST_CHECK_RETURN_VALUE
int MLK_API_NAMESPACE(keypair)(
uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)],
uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]);
/*************************************************
* Name: crypto_kem_enc_derand
*
* Description: Generates cipher text and shared
* secret for given public key
*
* Arguments: - uint8_t *ct: pointer to output cipher text, an array of
* MLKEM{512,768,1024}_CIPHERTEXTBYTES bytes.
* - uint8_t *ss: pointer to output shared secret, an array of
* MLKEM_BYTES bytes.
* - const uint8_t *pk: pointer to input public key, an array of
* MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
* - const uint8_t *coins: pointer to input randomness, an array of
* MLKEM_SYMBYTES bytes.
*
* Returns: - 0 on success
* - -1 if the 'modulus check' @[FIPS203, Section 7.2]
* for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 17, ML-KEM.Encaps_Internal]
*
**************************************************/
MLK_API_MUST_CHECK_RETURN_VALUE
int MLK_API_NAMESPACE(enc_derand)(
uint8_t ct[MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)],
uint8_t ss[MLKEM_BYTES],
const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)],
const uint8_t coins[MLKEM_SYMBYTES]);
/*************************************************
* Name: crypto_kem_enc
*
* Description: Generates cipher text and shared
* secret for given public key
*
* Arguments: - uint8_t *ct: pointer to output cipher text, an array of
* MLKEM{512,768,1024}_CIPHERTEXTBYTES bytes.
* - uint8_t *ss: pointer to output shared secret, an array of
* MLKEM_BYTES bytes.
* - const uint8_t *pk: pointer to input public key, an array of
* MLKEM{512,768,1024}_PUBLICKEYBYTES bytes.
*
* Returns: - 0 on success
* - -1 if the 'modulus check' @[FIPS203, Section 7.2]
* for the public key fails.
*
* Specification: Implements @[FIPS203, Algorithm 20, ML-KEM.Encaps]
*
**************************************************/
MLK_API_MUST_CHECK_RETURN_VALUE
int MLK_API_NAMESPACE(enc)(
uint8_t ct[MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)],
uint8_t ss[MLKEM_BYTES],
const uint8_t pk[MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]);
/*************************************************
* Name: crypto_kem_dec
*
* Description: Generates shared secret for given
* cipher text and private key
*
* Arguments: - uint8_t *ss: pointer to output shared secret, an array of
* MLKEM_BYTES bytes.
* - const uint8_t *ct: pointer to input cipher text, an array of
* MLKEM{512,768,1024}_CIPHERTEXTBYTES bytes.
* - const uint8_t *sk: pointer to input private key, an array of
* MLKEM{512,768,1024}_SECRETKEYBYTES bytes.
*
* Returns: - 0 on success
* - -1 if the 'hash check' @[FIPS203, Section 7.3]
* for the secret key fails.
*
* Specification: Implements @[FIPS203, Algorithm 21, ML-KEM.Decaps]
*
**************************************************/
MLK_API_MUST_CHECK_RETURN_VALUE
int MLK_API_NAMESPACE(dec)(
uint8_t ss[MLKEM_BYTES],
const uint8_t ct[MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)],
const uint8_t sk[MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)]);
/****************************** SUPERCOP API *********************************/
#if !defined(MLK_CONFIG_API_NO_SUPERCOP)
/* Export API in SUPERCOP naming scheme CRYPTO_xxx / crypto_kem_xxx */
#define CRYPTO_SECRETKEYBYTES MLKEM_SECRETKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)
#define CRYPTO_PUBLICKEYBYTES MLKEM_PUBLICKEYBYTES(MLK_CONFIG_API_PARAMETER_SET)
#define CRYPTO_CIPHERTEXTBYTES \
MLKEM_CIPHERTEXTBYTES(MLK_CONFIG_API_PARAMETER_SET)
#define CRYPTO_SYMBYTES MLKEM_SYMBYTES
#define CRYPTO_BYTES MLKEM_BYTES
#define crypto_kem_keypair_derand MLK_API_NAMESPACE(keypair_derand)
#define crypto_kem_keypair MLK_API_NAMESPACE(keypair)
#define crypto_kem_enc_derand MLK_API_NAMESPACE(enc_derand)
#define crypto_kem_enc MLK_API_NAMESPACE(enc)
#define crypto_kem_dec MLK_API_NAMESPACE(dec)
#else /* !MLK_CONFIG_API_NO_SUPERCOP */
/* If the SUPERCOP API is not needed, we can undefine the various helper macros
* above. Otherwise, they are needed for lazy evaluation of crypto_kem_xxx. */
#undef MLK_API_CONCAT
#undef MLK_API_CONCAT_
#undef MLK_API_CONCAT_UNDERSCORE
#undef MLK_API_NAMESPACE
#undef MLK_API_MUST_CHECK_RETURN_VALUE
#endif /* MLK_CONFIG_API_NO_SUPERCOP */
#endif /* !MLK_CONFIG_API_CONSTANTS_ONLY */
#endif /* !MLK_H */

View File

@@ -0,0 +1,441 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* WARNING: This file is auto-generated from scripts/autogen
* in the mlkem-native repository.
* Do not modify it directly.
*/
/******************************************************************************
*
* Single compilation unit (SCU) for fixed-level build of mlkem-native
*
* This compilation unit bundles together all source files for a build
* of mlkem-native for a fixed security level (MLKEM-512/768/1024).
*
* # API
*
* The API exposed by this file is described in mlkem_native.h.
*
* # Multi-level build
*
* If you want an SCU build of mlkem-native with support for multiple security
* levels, you need to include this file multiple times, and set
* MLK_CONFIG_MULTILEVEL_WITH_SHARED and MLK_CONFIG_MULTILEVEL_NO_SHARED
* appropriately. This is exemplified in examples/monolithic_build_multilevel
* and examples/monolithic_build_multilevel_native.
*
* # Configuration
*
* The following options from the mlkem-native configuration are relevant:
*
* - MLK_CONFIG_FIPS202_CUSTOM_HEADER
* Set this option if you use a custom FIPS202 implementation.
*
* - MLK_CONFIG_USE_NATIVE_BACKEND_ARITH
* Set this option if you want to include the native arithmetic backends
* in your build.
*
* - MLK_CONFIG_USE_NATIVE_BACKEND_FIPS202
* Set this option if you want to include the native FIPS202 backends
* in your build.
*
* - MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS
* Set this option if you want to keep the directives defined in
* level-independent headers. This is needed for a multi-level build.
*/
/* If parts of the mlkem-native source tree are not used,
* consider reducing this header via `unifdef`.
*
* Example:
* ```bash
* unifdef -UMLK_CONFIG_USE_NATIVE_BACKEND_ARITH mlkem_native.c
* ```
*/
#include "common.h"
#include "compress.c"
#include "debug.c"
#include "indcpa.c"
#include "kem.c"
#include "poly.c"
#include "poly_k.c"
#include "sampling.c"
#include "verify.c"
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
#if defined(MLK_SYS_AARCH64)
#include "native/aarch64/src/aarch64_zetas.c"
#include "native/aarch64/src/rej_uniform_table.c"
#endif
#if defined(MLK_SYS_X86_64)
#include "native/x86_64/src/consts.c"
#include "native/x86_64/src/rej_uniform_table.c"
#endif
#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */
/* Macro #undef's
*
* The following undefines macros from headers
* included by the source files imported above.
*
* This is to allow building and linking multiple builds
* of mlkem-native for varying parameter sets through concatenation
* of this file, as if the files had been compiled separately.
* If this is not relevant to you, you may remove the following.
*/
/*
* Undefine macros from MLK_CONFIG_PARAMETER_SET-specific files
*/
/* mlkem/mlkem_native.h */
#undef CRYPTO_BYTES
#undef CRYPTO_CIPHERTEXTBYTES
#undef CRYPTO_PUBLICKEYBYTES
#undef CRYPTO_SECRETKEYBYTES
#undef CRYPTO_SYMBYTES
#undef MLKEM1024_BYTES
#undef MLKEM1024_CIPHERTEXTBYTES
#undef MLKEM1024_PUBLICKEYBYTES
#undef MLKEM1024_SECRETKEYBYTES
#undef MLKEM1024_SYMBYTES
#undef MLKEM512_BYTES
#undef MLKEM512_CIPHERTEXTBYTES
#undef MLKEM512_PUBLICKEYBYTES
#undef MLKEM512_SECRETKEYBYTES
#undef MLKEM512_SYMBYTES
#undef MLKEM768_BYTES
#undef MLKEM768_CIPHERTEXTBYTES
#undef MLKEM768_PUBLICKEYBYTES
#undef MLKEM768_SECRETKEYBYTES
#undef MLKEM768_SYMBYTES
#undef MLKEM_BYTES
#undef MLKEM_CIPHERTEXTBYTES
#undef MLKEM_CIPHERTEXTBYTES_
#undef MLKEM_PUBLICKEYBYTES
#undef MLKEM_PUBLICKEYBYTES_
#undef MLKEM_SECRETKEYBYTES
#undef MLKEM_SECRETKEYBYTES_
#undef MLKEM_SYMBYTES
#undef MLK_API_CONCAT
#undef MLK_API_CONCAT_
#undef MLK_API_CONCAT_UNDERSCORE
#undef MLK_API_MUST_CHECK_RETURN_VALUE
#undef MLK_API_NAMESPACE
#undef MLK_H
#undef crypto_kem_dec
#undef crypto_kem_enc
#undef crypto_kem_enc_derand
#undef crypto_kem_keypair
#undef crypto_kem_keypair_derand
/* mlkem/src/common.h */
#undef MLK_ADD_PARAM_SET
#undef MLK_ASM_FN_SYMBOL
#undef MLK_ASM_NAMESPACE
#undef MLK_COMMON_H
#undef MLK_CONCAT
#undef MLK_CONCAT_
#undef MLK_CONFIG_API_NAMESPACE_PREFIX
#undef MLK_CONFIG_API_PARAMETER_SET
#undef MLK_EMPTY_CU
#undef MLK_EXTERNAL_API
#undef MLK_FIPS202X4_HEADER_FILE
#undef MLK_FIPS202_HEADER_FILE
#undef MLK_INTERNAL_API
#undef MLK_MULTILEVEL_BUILD
#undef MLK_NAMESPACE
#undef MLK_NAMESPACE_K
#undef MLK_NAMESPACE_PREFIX
#undef MLK_NAMESPACE_PREFIX_K
#undef mlk_memcpy
#undef mlk_memset
/* mlkem/src/indcpa.h */
#undef MLK_INDCPA_H
#undef mlk_gen_matrix
#undef mlk_indcpa_dec
#undef mlk_indcpa_enc
#undef mlk_indcpa_keypair_derand
/* mlkem/src/kem.h */
#undef MLK_CONFIG_API_NO_SUPERCOP
#undef MLK_KEM_H
#undef crypto_kem_check_pk
#undef crypto_kem_check_sk
#undef crypto_kem_dec
#undef crypto_kem_enc
#undef crypto_kem_enc_derand
#undef crypto_kem_keypair
#undef crypto_kem_keypair_derand
/* mlkem/src/params.h */
#undef MLKEM_DU
#undef MLKEM_DV
#undef MLKEM_ETA1
#undef MLKEM_ETA2
#undef MLKEM_INDCCA_CIPHERTEXTBYTES
#undef MLKEM_INDCCA_PUBLICKEYBYTES
#undef MLKEM_INDCCA_SECRETKEYBYTES
#undef MLKEM_INDCPA_BYTES
#undef MLKEM_INDCPA_MSGBYTES
#undef MLKEM_INDCPA_PUBLICKEYBYTES
#undef MLKEM_INDCPA_SECRETKEYBYTES
#undef MLKEM_K
#undef MLKEM_N
#undef MLKEM_POLYBYTES
#undef MLKEM_POLYCOMPRESSEDBYTES_D10
#undef MLKEM_POLYCOMPRESSEDBYTES_D11
#undef MLKEM_POLYCOMPRESSEDBYTES_D4
#undef MLKEM_POLYCOMPRESSEDBYTES_D5
#undef MLKEM_POLYCOMPRESSEDBYTES_DU
#undef MLKEM_POLYCOMPRESSEDBYTES_DV
#undef MLKEM_POLYVECBYTES
#undef MLKEM_POLYVECCOMPRESSEDBYTES_DU
#undef MLKEM_Q
#undef MLKEM_Q_HALF
#undef MLKEM_SSBYTES
#undef MLKEM_SYMBYTES
#undef MLKEM_UINT12_LIMIT
#undef MLK_PARAMS_H
/* mlkem/src/poly_k.h */
#undef MLK_POLY_K_H
#undef mlk_poly_compress_du
#undef mlk_poly_compress_dv
#undef mlk_poly_decompress_du
#undef mlk_poly_decompress_dv
#undef mlk_poly_getnoise_eta1122_4x
#undef mlk_poly_getnoise_eta1_4x
#undef mlk_poly_getnoise_eta2
#undef mlk_poly_getnoise_eta2_4x
#undef mlk_polymat
#undef mlk_polyvec
#undef mlk_polyvec_add
#undef mlk_polyvec_basemul_acc_montgomery_cached
#undef mlk_polyvec_compress_du
#undef mlk_polyvec_decompress_du
#undef mlk_polyvec_frombytes
#undef mlk_polyvec_invntt_tomont
#undef mlk_polyvec_mulcache
#undef mlk_polyvec_mulcache_compute
#undef mlk_polyvec_ntt
#undef mlk_polyvec_reduce
#undef mlk_polyvec_tobytes
#undef mlk_polyvec_tomont
#if !defined(MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS)
/*
* Undefine macros from MLK_CONFIG_PARAMETER_SET-generic files
*/
/* mlkem/src/compress.h */
#undef MLK_COMPRESS_H
#undef mlk_poly_compress_d10
#undef mlk_poly_compress_d11
#undef mlk_poly_compress_d4
#undef mlk_poly_compress_d5
#undef mlk_poly_decompress_d10
#undef mlk_poly_decompress_d11
#undef mlk_poly_decompress_d4
#undef mlk_poly_decompress_d5
#undef mlk_poly_frombytes
#undef mlk_poly_frommsg
#undef mlk_poly_tobytes
#undef mlk_poly_tomsg
/* mlkem/src/debug.h */
#undef MLK_DEBUG_H
#undef mlk_assert
#undef mlk_assert_abs_bound
#undef mlk_assert_abs_bound_2d
#undef mlk_assert_bound
#undef mlk_assert_bound_2d
#undef mlk_debug_check_assert
#undef mlk_debug_check_bounds
/* mlkem/src/poly.h */
#undef MLK_INVNTT_BOUND
#undef MLK_NTT_BOUND
#undef MLK_POLY_H
#undef mlk_poly_add
#undef mlk_poly_invntt_tomont
#undef mlk_poly_mulcache_compute
#undef mlk_poly_ntt
#undef mlk_poly_reduce
#undef mlk_poly_sub
#undef mlk_poly_tomont
/* mlkem/src/randombytes.h */
#undef MLK_RANDOMBYTES_H
/* mlkem/src/sampling.h */
#undef MLK_SAMPLING_H
#undef mlk_poly_cbd2
#undef mlk_poly_cbd3
#undef mlk_poly_rej_uniform
#undef mlk_poly_rej_uniform_x4
/* mlkem/src/symmetric.h */
#undef MLK_SYMMETRIC_H
#undef MLK_XOF_RATE
#undef mlk_hash_g
#undef mlk_hash_h
#undef mlk_hash_j
#undef mlk_prf_eta
#undef mlk_prf_eta1
#undef mlk_prf_eta1_x4
#undef mlk_prf_eta2
#undef mlk_xof_absorb
#undef mlk_xof_ctx
#undef mlk_xof_init
#undef mlk_xof_release
#undef mlk_xof_squeezeblocks
#undef mlk_xof_x4_absorb
#undef mlk_xof_x4_ctx
#undef mlk_xof_x4_init
#undef mlk_xof_x4_release
#undef mlk_xof_x4_squeezeblocks
/* mlkem/src/verify.h */
#undef MLK_USE_ASM_VALUE_BARRIER
#undef MLK_VERIFY_H
#undef mlk_ct_opt_blocker_u64
/* mlkem/src/cbmc.h */
#undef MLK_CBMC_H
#undef __contract__
#undef __loop__
#if defined(MLK_CONFIG_USE_NATIVE_BACKEND_ARITH)
/* mlkem/src/native/api.h */
#undef MLK_INVNTT_BOUND
#undef MLK_NATIVE_API_H
#undef MLK_NATIVE_FUNC_FALLBACK
#undef MLK_NATIVE_FUNC_SUCCESS
#undef MLK_NTT_BOUND
/* mlkem/src/native/meta.h */
#undef MLK_NATIVE_META_H
#if defined(MLK_SYS_AARCH64)
/*
* Undefine macros from native code (Arith, AArch64)
*/
/* mlkem/src/native/aarch64/meta.h */
#undef MLK_ARITH_BACKEND_AARCH64
#undef MLK_NATIVE_AARCH64_META_H
#undef MLK_USE_NATIVE_INTT
#undef MLK_USE_NATIVE_NTT
#undef MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
#undef MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE
#undef MLK_USE_NATIVE_POLY_REDUCE
#undef MLK_USE_NATIVE_POLY_TOBYTES
#undef MLK_USE_NATIVE_POLY_TOMONT
#undef MLK_USE_NATIVE_REJ_UNIFORM
/* mlkem/src/native/aarch64/src/arith_native_aarch64.h */
#undef MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
#undef mlk_aarch64_invntt_zetas_layer12345
#undef mlk_aarch64_invntt_zetas_layer67
#undef mlk_aarch64_ntt_zetas_layer12345
#undef mlk_aarch64_ntt_zetas_layer67
#undef mlk_aarch64_zetas_mulcache_native
#undef mlk_aarch64_zetas_mulcache_twisted_native
#undef mlk_intt_asm
#undef mlk_ntt_asm
#undef mlk_poly_mulcache_compute_asm
#undef mlk_poly_reduce_asm
#undef mlk_poly_tobytes_asm
#undef mlk_poly_tomont_asm
#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k2
#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k3
#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k4
#undef mlk_rej_uniform_asm
#undef mlk_rej_uniform_table
#endif /* MLK_SYS_AARCH64 */
#if defined(MLK_SYS_X86_64)
/*
* Undefine macros from native code (Arith, X86_64)
*/
/* mlkem/src/native/x86_64/meta.h */
#undef MLK_ARITH_BACKEND_X86_64_DEFAULT
#undef MLK_NATIVE_X86_64_META_H
#undef MLK_USE_NATIVE_INTT
#undef MLK_USE_NATIVE_NTT
#undef MLK_USE_NATIVE_NTT_CUSTOM_ORDER
#undef MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
#undef MLK_USE_NATIVE_POLY_COMPRESS_D10
#undef MLK_USE_NATIVE_POLY_COMPRESS_D11
#undef MLK_USE_NATIVE_POLY_COMPRESS_D4
#undef MLK_USE_NATIVE_POLY_COMPRESS_D5
#undef MLK_USE_NATIVE_POLY_DECOMPRESS_D10
#undef MLK_USE_NATIVE_POLY_DECOMPRESS_D11
#undef MLK_USE_NATIVE_POLY_DECOMPRESS_D4
#undef MLK_USE_NATIVE_POLY_DECOMPRESS_D5
#undef MLK_USE_NATIVE_POLY_FROMBYTES
#undef MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE
#undef MLK_USE_NATIVE_POLY_REDUCE
#undef MLK_USE_NATIVE_POLY_TOBYTES
#undef MLK_USE_NATIVE_POLY_TOMONT
#undef MLK_USE_NATIVE_REJ_UNIFORM
/* mlkem/src/native/x86_64/src/arith_native_x86_64.h */
#undef MLK_AVX2_REJ_UNIFORM_BUFLEN
#undef MLK_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H
#undef mlk_invntt_avx2
#undef mlk_ntt_avx2
#undef mlk_nttfrombytes_avx2
#undef mlk_ntttobytes_avx2
#undef mlk_nttunpack_avx2
#undef mlk_poly_compress_d10_avx2
#undef mlk_poly_compress_d11_avx2
#undef mlk_poly_compress_d4_avx2
#undef mlk_poly_compress_d5_avx2
#undef mlk_poly_decompress_d10_avx2
#undef mlk_poly_decompress_d11_avx2
#undef mlk_poly_decompress_d4_avx2
#undef mlk_poly_decompress_d5_avx2
#undef mlk_poly_mulcache_compute_avx2
#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k2
#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k3
#undef mlk_polyvec_basemul_acc_montgomery_cached_asm_k4
#undef mlk_reduce_avx2
#undef mlk_rej_uniform_asm
#undef mlk_rej_uniform_table
#undef mlk_tomont_avx2
/* mlkem/src/native/x86_64/src/consts.h */
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XQ
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT
#undef MLK_AVX2_BACKEND_DATA_OFFSET_16XV
#undef MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES
#undef MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB
#undef MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD
#undef MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP
#undef MLK_NATIVE_X86_64_SRC_CONSTS_H
#undef mlk_qdata
#endif /* MLK_SYS_X86_64 */
#endif /* MLK_CONFIG_USE_NATIVE_BACKEND_ARITH */
/* mlkem/src/sys.h
 *
 * These undefs must come last: the native clean-up sections above are
 * guarded by MLK_SYS_AARCH64 / MLK_SYS_X86_64. Undefining those macros
 * before the guards are evaluated would silently skip the backend clean-up
 * and leave backend header guards and MLK_USE_NATIVE_* macros defined. */
#undef MLK_ALIGN
#undef MLK_ALIGN_UP
#undef MLK_ALWAYS_INLINE
#undef MLK_CET_ENDBR
#undef MLK_CT_TESTING_DECLASSIFY
#undef MLK_CT_TESTING_SECRET
#undef MLK_DEFAULT_ALIGN
#undef MLK_HAVE_INLINE_ASM
#undef MLK_INLINE
#undef MLK_MUST_CHECK_RETURN_VALUE
#undef MLK_RESTRICT
#undef MLK_SYS_AARCH64
#undef MLK_SYS_AARCH64_EB
#undef MLK_SYS_APPLE
#undef MLK_SYS_BIG_ENDIAN
#undef MLK_SYS_H
#undef MLK_SYS_LINUX
#undef MLK_SYS_LITTLE_ENDIAN
#undef MLK_SYS_PPC64LE
#undef MLK_SYS_RISCV32
#undef MLK_SYS_RISCV64
#undef MLK_SYS_WINDOWS
#undef MLK_SYS_X86_64
#undef MLK_SYS_X86_64_AVX2
#endif /* !MLK_CONFIG_MONOBUILD_KEEP_SHARED_HEADERS */

View File

@@ -0,0 +1,112 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_NATIVE_AARCH64_META_H
#define MLK_NATIVE_AARCH64_META_H
/* Set of primitives that this backend replaces */
#define MLK_USE_NATIVE_NTT
#define MLK_USE_NATIVE_INTT
#define MLK_USE_NATIVE_POLY_REDUCE
#define MLK_USE_NATIVE_POLY_TOMONT
#define MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE
#define MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
#define MLK_USE_NATIVE_POLY_TOBYTES
#define MLK_USE_NATIVE_REJ_UNIFORM
/* Identifier for this backend so that source and assembly files
* in the build can be appropriately guarded. */
#define MLK_ARITH_BACKEND_AARCH64
#if !defined(__ASSEMBLER__)
#include "../api.h"
#include "src/arith_native_aarch64.h"
/*
 * AArch64 backend hook for the forward NTT.
 *
 * Forwards data to mlk_ntt_asm together with this backend's two twiddle
 * tables (layers 1-5 and layers 6-7). The assembly routine updates data in
 * place. Always returns MLK_NATIVE_FUNC_SUCCESS; this wrapper never returns
 * MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N])
{
mlk_ntt_asm(data, mlk_aarch64_ntt_zetas_layer12345,
mlk_aarch64_ntt_zetas_layer67);
return MLK_NATIVE_FUNC_SUCCESS;
}
/*
 * AArch64 backend hook for the inverse NTT.
 *
 * Forwards data to mlk_intt_asm together with this backend's two inverse
 * twiddle tables (layers 1-5 and layers 6-7). The assembly routine updates
 * data in place. Always returns MLK_NATIVE_FUNC_SUCCESS; this wrapper never
 * returns MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N])
{
mlk_intt_asm(data, mlk_aarch64_invntt_zetas_layer12345,
mlk_aarch64_invntt_zetas_layer67);
return MLK_NATIVE_FUNC_SUCCESS;
}
/*
 * AArch64 backend hook for polynomial coefficient reduction.
 *
 * Calls mlk_poly_reduce_asm on data, which rewrites the MLKEM_N coefficients
 * in place. Always returns MLK_NATIVE_FUNC_SUCCESS; this wrapper never
 * returns MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N])
{
mlk_poly_reduce_asm(data);
return MLK_NATIVE_FUNC_SUCCESS;
}
/*
 * AArch64 backend hook for the "tomont" polynomial conversion
 * (presumably conversion to the Montgomery domain -- confirm against the
 * generic poly_tomont implementation).
 *
 * Calls mlk_poly_tomont_asm on data, which rewrites the MLKEM_N coefficients
 * in place. Always returns MLK_NATIVE_FUNC_SUCCESS; this wrapper never
 * returns MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N])
{
mlk_poly_tomont_asm(data);
return MLK_NATIVE_FUNC_SUCCESS;
}
/*
 * AArch64 backend hook for computing a polynomial's multiplication cache.
 *
 * x: output cache of MLKEM_N / 2 coefficients.
 * y: input polynomial of MLKEM_N coefficients (read-only).
 *
 * Forwards both buffers to mlk_poly_mulcache_compute_asm along with this
 * backend's plain and "twisted" mulcache zeta tables. Always returns
 * MLK_NATIVE_FUNC_SUCCESS; this wrapper never returns
 * MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
const int16_t y[MLKEM_N])
{
mlk_poly_mulcache_compute_asm(x, y, mlk_aarch64_zetas_mulcache_native,
mlk_aarch64_zetas_mulcache_twisted_native);
return MLK_NATIVE_FUNC_SUCCESS;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2
/*
 * AArch64 backend hook for the cached base multiplication with
 * accumulation, vector length 2.
 *
 * r:       output polynomial, MLKEM_N coefficients.
 * a, b:    input vectors, 2 * MLKEM_N coefficients each (read-only).
 * b_cache: cache belonging to b, 2 * (MLKEM_N / 2) coefficients (read-only).
 *
 * Forwards all arguments unchanged to the k2 assembly routine. Always
 * returns MLK_NATIVE_FUNC_SUCCESS; this wrapper never returns
 * MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache);
return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3
/*
 * AArch64 backend hook for the cached base multiplication with
 * accumulation, vector length 3.
 *
 * r:       output polynomial, MLKEM_N coefficients.
 * a, b:    input vectors, 3 * MLKEM_N coefficients each (read-only).
 * b_cache: cache belonging to b, 3 * (MLKEM_N / 2) coefficients (read-only).
 *
 * Forwards all arguments unchanged to the k3 assembly routine. Always
 * returns MLK_NATIVE_FUNC_SUCCESS; this wrapper never returns
 * MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache);
return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
/*
 * AArch64 backend hook for the cached base multiplication with
 * accumulation, vector length 4.
 *
 * r:       output polynomial, MLKEM_N coefficients.
 * a, b:    input vectors, 4 * MLKEM_N coefficients each (read-only).
 * b_cache: cache belonging to b, 4 * (MLKEM_N / 2) coefficients (read-only).
 *
 * Forwards all arguments unchanged to the k4 assembly routine. Always
 * returns MLK_NATIVE_FUNC_SUCCESS; this wrapper never returns
 * MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
{
mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache);
return MLK_NATIVE_FUNC_SUCCESS;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
/*
 * AArch64 backend hook for serializing a polynomial to bytes.
 *
 * r: output buffer of MLKEM_POLYBYTES bytes.
 * a: input polynomial of MLKEM_N coefficients (read-only).
 *
 * Forwards both buffers to mlk_poly_tobytes_asm. Always returns
 * MLK_NATIVE_FUNC_SUCCESS; this wrapper never returns
 * MLK_NATIVE_FUNC_FALLBACK.
 */
static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
const int16_t a[MLKEM_N])
{
mlk_poly_tobytes_asm(r, a);
return MLK_NATIVE_FUNC_SUCCESS;
}
/*
 * AArch64 backend hook for rejection sampling.
 *
 * The assembly routine is used only when len equals MLKEM_N and buflen is a
 * multiple of 24. Every other call is handed back to the caller with
 * MLK_NATIVE_FUNC_FALLBACK. On the native path the result of
 * mlk_rej_uniform_asm is returned, converted to int.
 */
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
                                             const uint8_t *buf,
                                             unsigned buflen)
{
  /* NEON support is mandatory for AArch64 */
  if (len == MLKEM_N && buflen % 24 == 0)
  {
    return (int)mlk_rej_uniform_asm(r, buf, buflen, mlk_rej_uniform_table);
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
#endif /* !__ASSEMBLER__ */
#endif /* !MLK_NATIVE_AARCH64_META_H */

View File

@@ -0,0 +1,175 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* WARNING: This file is auto-generated from scripts/autogen
* in the mlkem-native repository.
* Do not modify it directly.
*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <stdint.h>
#include "arith_native_aarch64.h"
/*
* Table of zeta values used in the AArch64 forward NTT
* See autogen for details.
*/
MLK_ALIGN const int16_t mlk_aarch64_ntt_zetas_layer12345[] = {
-1600, -15749, -749, -7373, -40, -394, -687, -6762, 630, 6201,
-1432, -14095, 848, 8347, 0, 0, 1062, 10453, 296, 2914,
-882, -8682, 0, 0, -1410, -13879, 1339, 13180, 1476, 14529,
0, 0, 193, 1900, -283, -2786, 56, 551, 0, 0,
797, 7845, -1089, -10719, 1333, 13121, 0, 0, -543, -5345,
1426, 14036, -1235, -12156, 0, 0, -69, -679, 535, 5266,
-447, -4400, 0, 0, 569, 5601, -936, -9213, -450, -4429,
0, 0, -1583, -15582, -1355, -13338, 821, 8081, 0, 0,
};
MLK_ALIGN const int16_t mlk_aarch64_ntt_zetas_layer67[] = {
289, 289, 331, 331, -76, -76, -1573, -1573, 2845,
2845, 3258, 3258, -748, -748, -15483, -15483, 17, 17,
583, 583, 1637, 1637, -1041, -1041, 167, 167, 5739,
5739, 16113, 16113, -10247, -10247, -568, -568, -680, -680,
723, 723, 1100, 1100, -5591, -5591, -6693, -6693, 7117,
7117, 10828, 10828, 1197, 1197, -1025, -1025, -1052, -1052,
-1274, -1274, 11782, 11782, -10089, -10089, -10355, -10355, -12540,
-12540, 1409, 1409, -48, -48, 756, 756, -314, -314,
13869, 13869, -472, -472, 7441, 7441, -3091, -3091, -667,
-667, 233, 233, -1173, -1173, -279, -279, -6565, -6565,
2293, 2293, -11546, -11546, -2746, -2746, 650, 650, -1352,
-1352, -816, -816, 632, 632, 6398, 6398, -13308, -13308,
-8032, -8032, 6221, 6221, -1626, -1626, -540, -540, -1482,
-1482, 1461, 1461, -16005, -16005, -5315, -5315, -14588, -14588,
14381, 14381, 1651, 1651, -1540, -1540, 952, 952, -642,
-642, 16251, 16251, -15159, -15159, 9371, 9371, -6319, -6319,
-464, -464, 33, 33, 1320, 1320, -1414, -1414, -4567,
-4567, 325, 325, 12993, 12993, -13918, -13918, 939, 939,
-892, -892, 733, 733, 268, 268, 9243, 9243, -8780,
-8780, 7215, 7215, 2638, 2638, -1021, -1021, -941, -941,
-992, -992, 641, 641, -10050, -10050, -9262, -9262, -9764,
-9764, 6309, 6309, -1010, -1010, 1435, 1435, 807, 807,
452, 452, -9942, -9942, 14125, 14125, 7943, 7943, 4449,
4449, 1584, 1584, -1292, -1292, 375, 375, -1239, -1239,
15592, 15592, -12717, -12717, 3691, 3691, -12196, -12196, -1031,
-1031, -109, -109, -780, -780, 1645, 1645, -10148, -10148,
-1073, -1073, -7678, -7678, 16192, 16192, 1438, 1438, -461,
-461, 1534, 1534, -927, -927, 14155, 14155, -4538, -4538,
15099, 15099, -9125, -9125, 1063, 1063, -556, -556, -1230,
-1230, -863, -863, 10463, 10463, -5473, -5473, -12107, -12107,
-8495, -8495, 319, 319, 757, 757, 561, 561, -735,
-735, 3140, 3140, 7451, 7451, 5522, 5522, -7235, -7235,
-682, -682, -712, -712, 1481, 1481, 648, 648, -6713,
-6713, -7008, -7008, 14578, 14578, 6378, 6378, -525, -525,
403, 403, 1143, 1143, -554, -554, -5168, -5168, 3967,
3967, 11251, 11251, -5453, -5453, 1092, 1092, 1026, 1026,
-1179, -1179, 886, 886, 10749, 10749, 10099, 10099, -11605,
-11605, 8721, 8721, -855, -855, -219, -219, 1227, 1227,
910, 910, -8416, -8416, -2156, -2156, 12078, 12078, 8957,
8957, -1607, -1607, -1455, -1455, -1219, -1219, 885, 885,
-15818, -15818, -14322, -14322, -11999, -11999, 8711, 8711, 1212,
1212, 1029, 1029, -394, -394, -1175, -1175, 11930, 11930,
10129, 10129, -3878, -3878, -11566, -11566,
};
MLK_ALIGN const int16_t mlk_aarch64_invntt_zetas_layer12345[] = {
1583, 15582, -821, -8081, 1355, 13338, 0, 0, -569, -5601,
450, 4429, 936, 9213, 0, 0, 69, 679, 447, 4400,
-535, -5266, 0, 0, 543, 5345, 1235, 12156, -1426, -14036,
0, 0, -797, -7845, -1333, -13121, 1089, 10719, 0, 0,
-193, -1900, -56, -551, 283, 2786, 0, 0, 1410, 13879,
-1476, -14529, -1339, -13180, 0, 0, -1062, -10453, 882, 8682,
-296, -2914, 0, 0, 1600, 15749, 40, 394, 749, 7373,
-848, -8347, 1432, 14095, -630, -6201, 687, 6762, 0, 0,
};
MLK_ALIGN const int16_t mlk_aarch64_invntt_zetas_layer67[] = {
-910, -910, -1227, -1227, 219, 219, 855, 855, -8957,
-8957, -12078, -12078, 2156, 2156, 8416, 8416, 1175, 1175,
394, 394, -1029, -1029, -1212, -1212, 11566, 11566, 3878,
3878, -10129, -10129, -11930, -11930, -885, -885, 1219, 1219,
1455, 1455, 1607, 1607, -8711, -8711, 11999, 11999, 14322,
14322, 15818, 15818, -648, -648, -1481, -1481, 712, 712,
682, 682, -6378, -6378, -14578, -14578, 7008, 7008, 6713,
6713, -886, -886, 1179, 1179, -1026, -1026, -1092, -1092,
-8721, -8721, 11605, 11605, -10099, -10099, -10749, -10749, 554,
554, -1143, -1143, -403, -403, 525, 525, 5453, 5453,
-11251, -11251, -3967, -3967, 5168, 5168, 927, 927, -1534,
-1534, 461, 461, -1438, -1438, 9125, 9125, -15099, -15099,
4538, 4538, -14155, -14155, 735, 735, -561, -561, -757,
-757, -319, -319, 7235, 7235, -5522, -5522, -7451, -7451,
-3140, -3140, 863, 863, 1230, 1230, 556, 556, -1063,
-1063, 8495, 8495, 12107, 12107, 5473, 5473, -10463, -10463,
-452, -452, -807, -807, -1435, -1435, 1010, 1010, -4449,
-4449, -7943, -7943, -14125, -14125, 9942, 9942, -1645, -1645,
780, 780, 109, 109, 1031, 1031, -16192, -16192, 7678,
7678, 1073, 1073, 10148, 10148, 1239, 1239, -375, -375,
1292, 1292, -1584, -1584, 12196, 12196, -3691, -3691, 12717,
12717, -15592, -15592, 1414, 1414, -1320, -1320, -33, -33,
464, 464, 13918, 13918, -12993, -12993, -325, -325, 4567,
4567, -641, -641, 992, 992, 941, 941, 1021, 1021,
-6309, -6309, 9764, 9764, 9262, 9262, 10050, 10050, -268,
-268, -733, -733, 892, 892, -939, -939, -2638, -2638,
-7215, -7215, 8780, 8780, -9243, -9243, -632, -632, 816,
816, 1352, 1352, -650, -650, -6221, -6221, 8032, 8032,
13308, 13308, -6398, -6398, 642, 642, -952, -952, 1540,
1540, -1651, -1651, 6319, 6319, -9371, -9371, 15159, 15159,
-16251, -16251, -1461, -1461, 1482, 1482, 540, 540, 1626,
1626, -14381, -14381, 14588, 14588, 5315, 5315, 16005, 16005,
1274, 1274, 1052, 1052, 1025, 1025, -1197, -1197, 12540,
12540, 10355, 10355, 10089, 10089, -11782, -11782, 279, 279,
1173, 1173, -233, -233, 667, 667, 2746, 2746, 11546,
11546, -2293, -2293, 6565, 6565, 314, 314, -756, -756,
48, 48, -1409, -1409, 3091, 3091, -7441, -7441, 472,
472, -13869, -13869, 1573, 1573, 76, 76, -331, -331,
-289, -289, 15483, 15483, 748, 748, -3258, -3258, -2845,
-2845, -1100, -1100, -723, -723, 680, 680, 568, 568,
-10828, -10828, -7117, -7117, 6693, 6693, 5591, 5591, 1041,
1041, -1637, -1637, -583, -583, -17, -17, 10247, 10247,
-16113, -16113, -5739, -5739, -167, -167,
};
MLK_ALIGN const int16_t mlk_aarch64_zetas_mulcache_native[] = {
17, -17, -568, 568, 583, -583, -680, 680, 1637, -1637, 723,
-723, -1041, 1041, 1100, -1100, 1409, -1409, -667, 667, -48, 48,
233, -233, 756, -756, -1173, 1173, -314, 314, -279, 279, -1626,
1626, 1651, -1651, -540, 540, -1540, 1540, -1482, 1482, 952, -952,
1461, -1461, -642, 642, 939, -939, -1021, 1021, -892, 892, -941,
941, 733, -733, -992, 992, 268, -268, 641, -641, 1584, -1584,
-1031, 1031, -1292, 1292, -109, 109, 375, -375, -780, 780, -1239,
1239, 1645, -1645, 1063, -1063, 319, -319, -556, 556, 757, -757,
-1230, 1230, 561, -561, -863, 863, -735, 735, -525, 525, 1092,
-1092, 403, -403, 1026, -1026, 1143, -1143, -1179, 1179, -554, 554,
886, -886, -1607, 1607, 1212, -1212, -1455, 1455, 1029, -1029, -1219,
1219, -394, 394, 885, -885, -1175, 1175,
};
MLK_ALIGN const int16_t mlk_aarch64_zetas_mulcache_twisted_native[] = {
167, -167, -5591, 5591, 5739, -5739, -6693, 6693, 16113,
-16113, 7117, -7117, -10247, 10247, 10828, -10828, 13869, -13869,
-6565, 6565, -472, 472, 2293, -2293, 7441, -7441, -11546,
11546, -3091, 3091, -2746, 2746, -16005, 16005, 16251, -16251,
-5315, 5315, -15159, 15159, -14588, 14588, 9371, -9371, 14381,
-14381, -6319, 6319, 9243, -9243, -10050, 10050, -8780, 8780,
-9262, 9262, 7215, -7215, -9764, 9764, 2638, -2638, 6309,
-6309, 15592, -15592, -10148, 10148, -12717, 12717, -1073, 1073,
3691, -3691, -7678, 7678, -12196, 12196, 16192, -16192, 10463,
-10463, 3140, -3140, -5473, 5473, 7451, -7451, -12107, 12107,
5522, -5522, -8495, 8495, -7235, 7235, -5168, 5168, 10749,
-10749, 3967, -3967, 10099, -10099, 11251, -11251, -11605, 11605,
-5453, 5453, 8721, -8721, -15818, 15818, 11930, -11930, -14322,
14322, 10129, -10129, -11999, 11999, -3878, 3878, 8711, -8711,
-11566, 11566,
};
#else /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(aarch64_zetas)
#endif /* !(MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED) */

View File

@@ -0,0 +1,177 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
#define MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H
#include <stdint.h>
#include "../../../cbmc.h"
#include "../../../common.h"
#define mlk_aarch64_ntt_zetas_layer12345 \
MLK_NAMESPACE(aarch64_ntt_zetas_layer12345)
#define mlk_aarch64_ntt_zetas_layer67 MLK_NAMESPACE(aarch64_ntt_zetas_layer67)
#define mlk_aarch64_invntt_zetas_layer12345 \
MLK_NAMESPACE(aarch64_invntt_zetas_layer12345)
#define mlk_aarch64_invntt_zetas_layer67 \
MLK_NAMESPACE(aarch64_invntt_zetas_layer67)
#define mlk_aarch64_zetas_mulcache_native \
MLK_NAMESPACE(aarch64_zetas_mulcache_native)
#define mlk_aarch64_zetas_mulcache_twisted_native \
MLK_NAMESPACE(aarch64_zetas_mulcache_twisted_native)
#define mlk_rej_uniform_table MLK_NAMESPACE(rej_uniform_table)
extern const int16_t mlk_aarch64_ntt_zetas_layer12345[];
extern const int16_t mlk_aarch64_ntt_zetas_layer67[];
extern const int16_t mlk_aarch64_invntt_zetas_layer12345[];
extern const int16_t mlk_aarch64_invntt_zetas_layer67[];
extern const int16_t mlk_aarch64_zetas_mulcache_native[];
extern const int16_t mlk_aarch64_zetas_mulcache_twisted_native[];
extern const uint8_t mlk_rej_uniform_table[];
/*
 * AArch64 assembly forward NTT, operating in place on the coefficients of p.
 *
 * Per the contract below, the caller must pass exactly this backend's
 * tables (mlk_aarch64_ntt_zetas_layer12345 and
 * mlk_aarch64_ntt_zetas_layer67) and an input satisfying
 * array_abs_bound(p, 0, MLKEM_N, 8192). On return the contract guarantees
 * array_abs_bound(p, 0, MLKEM_N, 23595).
 *
 * NOTE(review): the second table parameter is named twiddles56 although the
 * contract binds it to the layer-6/7 table. The name looks like a leftover
 * and could be renamed to twiddles67 -- confirm against the HOL-Light spec.
 */
#define mlk_ntt_asm MLK_NAMESPACE(ntt_asm)
void mlk_ntt_asm(int16_t p[256], const int16_t twiddles12345[80],
const int16_t twiddles56[384])
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/arm/proofs/mlkem_ntt.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, 8192))
requires(twiddles12345 == mlk_aarch64_ntt_zetas_layer12345)
requires(twiddles56 == mlk_aarch64_ntt_zetas_layer67)
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
/* check-magic: off */
ensures(array_abs_bound(p, 0, MLKEM_N, 23595))
/* check-magic: on */
);
/*
 * AArch64 assembly inverse NTT, operating in place on the coefficients of p.
 *
 * Per the contract below, the caller must pass exactly this backend's
 * inverse tables (mlk_aarch64_invntt_zetas_layer12345 and
 * mlk_aarch64_invntt_zetas_layer67). No input bound is required. On return
 * the contract guarantees array_abs_bound(p, 0, MLKEM_N, 26625).
 *
 * NOTE(review): the second table parameter is named twiddles56 although the
 * contract binds it to the layer-6/7 table. The name looks like a leftover
 * and could be renamed to twiddles67 -- confirm against the HOL-Light spec.
 */
#define mlk_intt_asm MLK_NAMESPACE(intt_asm)
void mlk_intt_asm(int16_t p[256], const int16_t twiddles12345[80],
const int16_t twiddles56[384])
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/arm/proofs/mlkem_intt.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(twiddles12345 == mlk_aarch64_invntt_zetas_layer12345)
requires(twiddles56 == mlk_aarch64_invntt_zetas_layer67)
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
/* check-magic: off */
ensures(array_abs_bound(p, 0, MLKEM_N, 26625))
/* check-magic: on */
);
/*
 * AArch64 assembly coefficient reduction, operating in place on p.
 *
 * The contract places no bound on the input. On return it guarantees
 * array_bound(p, 0, MLKEM_N, 0, MLKEM_Q).
 */
#define mlk_poly_reduce_asm MLK_NAMESPACE(poly_reduce_asm)
void mlk_poly_reduce_asm(int16_t p[256])
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/arm/proofs/mlkem_poly_reduce.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
ensures(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
);
/*
 * AArch64 assembly "tomont" conversion, operating in place on p
 * (presumably conversion to the Montgomery domain -- confirm against the
 * HOL-Light specification referenced below).
 *
 * The contract places no bound on the input. On return it guarantees
 * array_abs_bound(p, 0, MLKEM_N, MLKEM_Q).
 */
#define mlk_poly_tomont_asm MLK_NAMESPACE(poly_tomont_asm)
void mlk_poly_tomont_asm(int16_t p[256])
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/arm/proofs/mlkem_poly_tomont.ml */
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
ensures(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
);
/*
 * AArch64 assembly computation of a polynomial's multiplication cache.
 *
 * cache:         output, MLKEM_N / 2 coefficients.
 * mlk_poly:      input polynomial, MLKEM_N coefficients (read-only).
 * zetas:         must be mlk_aarch64_zetas_mulcache_native.
 * zetas_twisted: must be mlk_aarch64_zetas_mulcache_twisted_native.
 *
 * On return the contract guarantees
 * array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q).
 */
#define mlk_poly_mulcache_compute_asm MLK_NAMESPACE(poly_mulcache_compute_asm)
void mlk_poly_mulcache_compute_asm(int16_t cache[128],
const int16_t mlk_poly[256],
const int16_t zetas[128],
const int16_t zetas_twisted[128])
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/arm/proofs/mlkem_poly_mulcache_compute.ml */
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
requires(zetas == mlk_aarch64_zetas_mulcache_native)
requires(zetas_twisted == mlk_aarch64_zetas_mulcache_twisted_native)
assigns(object_whole(cache))
ensures(array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
/* Writes the MLKEM_POLYBYTES-byte output r from the MLKEM_N coefficients of
 * a, implemented in AArch64 assembly. Judging by the name this is the
 * byte serialization of a polynomial - TODO confirm the exact packing
 * against the HOL-Light specification referenced below.
 *
 * Contract summary:
 *  - r must be a non-aliased buffer of MLKEM_POLYBYTES bytes;
 *  - a must be a non-aliased buffer of MLKEM_N int16_t values, each within
 *    the range given by array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
 *  - the assigns clause covers the whole object behind r (object_whole);
 *  - there is no ensures clause: the contract states nothing about the
 *    bytes produced. */
#define mlk_poly_tobytes_asm MLK_NAMESPACE(poly_tobytes_asm)
void mlk_poly_tobytes_asm(uint8_t r[384], const int16_t a[256])
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/arm/proofs/mlkem_poly_tobytes.ml */
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(object_whole(r))
);
/* Vector-length-2 variant of the cached base-multiplication routine,
 * implemented in AArch64 assembly: reads two polynomial vectors a and b of
 * 2 * MLKEM_N coefficients each plus the cache b_cache of 2 * (MLKEM_N / 2)
 * values, and writes one polynomial of MLKEM_N coefficients to r.
 * Judging by the name, r receives the accumulated Montgomery base products
 * of a and b - TODO confirm against the HOL-Light specification below.
 *
 * Contract summary:
 *  - r, a, b and b_cache must each be non-aliased buffers of the sizes
 *    stated above;
 *  - only a carries a value precondition: array_abs_bound with bound
 *    MLKEM_UINT12_LIMIT + 1. No bound is required of b or b_cache;
 *  - only r is written; there is no ensures clause, so the contract states
 *    nothing about the output coefficients. */
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k2 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(
int16_t r[256], const int16_t a[512], const int16_t b[512],
const int16_t b_cache[256])
/* This must be kept in sync with the HOL-Light specification in
* proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k2.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(memory_no_alias(a, sizeof(int16_t) * 2 * MLKEM_N))
requires(memory_no_alias(b, sizeof(int16_t) * 2 * MLKEM_N))
requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
requires(array_abs_bound(a, 0, 2 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
);
/* Vector-length-3 variant of the cached base-multiplication routine,
 * implemented in AArch64 assembly: reads two polynomial vectors a and b of
 * 3 * MLKEM_N coefficients each plus the cache b_cache of 3 * (MLKEM_N / 2)
 * values, and writes one polynomial of MLKEM_N coefficients to r.
 * Judging by the name, r receives the accumulated Montgomery base products
 * of a and b - TODO confirm against the HOL-Light specification below.
 *
 * Contract summary:
 *  - r, a, b and b_cache must each be non-aliased buffers of the sizes
 *    stated above;
 *  - only a carries a value precondition: array_abs_bound with bound
 *    MLKEM_UINT12_LIMIT + 1. No bound is required of b or b_cache;
 *  - only r is written; there is no ensures clause, so the contract states
 *    nothing about the output coefficients. */
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(
int16_t r[256], const int16_t a[768], const int16_t b[768],
const int16_t b_cache[384])
/* This must be kept in sync with the HOL-Light specification in
* proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k3.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(memory_no_alias(a, sizeof(int16_t) * 3 * MLKEM_N))
requires(memory_no_alias(b, sizeof(int16_t) * 3 * MLKEM_N))
requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
requires(array_abs_bound(a, 0, 3 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
);
/* Vector-length-4 variant of the cached base-multiplication routine,
 * implemented in AArch64 assembly: reads two polynomial vectors a and b of
 * 4 * MLKEM_N coefficients each plus the cache b_cache of 4 * (MLKEM_N / 2)
 * values, and writes one polynomial of MLKEM_N coefficients to r.
 * Judging by the name, r receives the accumulated Montgomery base products
 * of a and b - TODO confirm against the HOL-Light specification below.
 *
 * Contract summary:
 *  - r, a, b and b_cache must each be non-aliased buffers of the sizes
 *    stated above;
 *  - only a carries a value precondition: array_abs_bound with bound
 *    MLKEM_UINT12_LIMIT + 1. No bound is required of b or b_cache;
 *  - only r is written; there is no ensures clause, so the contract states
 *    nothing about the output coefficients. */
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(
int16_t r[256], const int16_t a[1024], const int16_t b[1024],
const int16_t b_cache[512])
/* This must be kept in sync with the HOL-Light specification in
* proofs/hol_light/arm/proofs/mlkem_poly_basemul_acc_montgomery_cached_k4.ml.
*/
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(memory_no_alias(a, sizeof(int16_t) * 4 * MLKEM_N))
requires(memory_no_alias(b, sizeof(int16_t) * 4 * MLKEM_N))
requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
requires(array_abs_bound(a, 0, 4 * MLKEM_N, MLKEM_UINT12_LIMIT + 1))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
);
/* Fills a prefix of r from the byte buffer buf and returns how many
 * coefficients were written, implemented in AArch64 assembly. Judging by
 * the name this is uniform rejection sampling - TODO confirm the sampling
 * rule against the HOL-Light specification referenced below.
 *
 * Contract summary:
 *  - buflen must be a multiple of 24 and buf must be a non-aliased buffer
 *    of buflen bytes;
 *  - table must be exactly mlk_rej_uniform_table (pointer equality);
 *  - r must be a non-aliased buffer of MLKEM_N int16_t values and is the
 *    only memory written;
 *  - the return value is at most MLKEM_N, and the first return_value
 *    entries of r satisfy array_bound(..., 0, MLKEM_Q). The contract says
 *    nothing about entries of r at or beyond return_value. */
#define mlk_rej_uniform_asm MLK_NAMESPACE(rej_uniform_asm)
uint64_t mlk_rej_uniform_asm(int16_t r[256], const uint8_t *buf,
unsigned buflen, const uint8_t table[2048])
/* This must be kept in sync with the HOL-Light specification
* in proofs/hol_light/arm/proofs/mlkem_rej_uniform.ml. */
__contract__(
requires(buflen % 24 == 0)
requires(memory_no_alias(buf, buflen))
requires(table == mlk_rej_uniform_table)
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(return_value <= MLKEM_N)
ensures(array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
);
#endif /* !MLK_NATIVE_AARCH64_SRC_ARITH_NATIVE_AARCH64_H */

View File

@@ -0,0 +1,612 @@
/* Copyright (c) 2022 Arm Limited
* Copyright (c) 2022 Hanno Becker
* Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [NeonNTT]
* Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
* Becker, Hwang, Kannwischer, Yang, Yang
* https://eprint.iacr.org/2021/986
*
* - [SLOTHY_Paper]
* Fast and Clean: Auditable high-performance assembly via constraint solving
* Abdulrahman, Becker, Kannwischer, Klein
* https://eprint.iacr.org/2022/1303
*/
/*yaml
Name: intt_asm
Description: AArch64 ML-KEM inverse NTT following @[NeonNTT] and @[SLOTHY_Paper]
Signature: void mlk_intt_asm(int16_t p[256], const int16_t twiddles12345[80], const int16_t twiddles56[384])
ABI:
x0:
type: buffer
size_bytes: 512
permissions: read/write
c_parameter: int16_t p[256]
description: Input/output polynomial
x1:
type: buffer
size_bytes: 160
permissions: read-only
c_parameter: const int16_t twiddles12345[80]
description: Twiddle factors for layers 1-5
x2:
type: buffer
size_bytes: 768
permissions: read-only
c_parameter: const int16_t twiddles56[384]
description: Twiddle factors for layers 6-7
Stack:
bytes: 64
description: saving callee-saved Neon registers
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/intt.S using scripts/simpasm. Do not modify it directly.
*/
// mlkem_intt_asm: in-place inverse NTT of the 256-coefficient polynomial at x0.
//   x0 = polynomial (512 bytes, read/write)
//   x1 = twiddle table for the "layer123"/per-block constants (160 bytes read)
//   x2 = twiddle table for the "layer4567" phase (768 bytes read)
// Structure: prologue, constant setup, a scaling pass over all coefficients,
// a software-pipelined "layer4567" pass (64-byte blocks), a software-pipelined
// "layer123" pass (16-byte columns at stride 0x40), epilogue.
// The instruction order is the output of an optimizing scheduler; loads for
// the next iteration are interleaved with arithmetic for the current one, so
// statements must not be reordered by hand.
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_intt_asm)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_intt_asm)
S2N_BN_SYMBOL(mlkem_intt_asm):
.cfi_startproc
// Prologue: reserve 64 bytes of stack and save d8-d15 (callee-saved SIMD
// registers), with matching CFI unwind annotations.
sub sp, sp, #0x40
.cfi_adjust_cfa_offset 0x40
stp d8, d9, [sp]
.cfi_rel_offset d8, 0x0
.cfi_rel_offset d9, 0x8
stp d10, d11, [sp, #0x10]
.cfi_rel_offset d10, 0x10
.cfi_rel_offset d11, 0x18
stp d12, d13, [sp, #0x20]
.cfi_rel_offset d12, 0x20
.cfi_rel_offset d13, 0x28
stp d14, d15, [sp, #0x30]
.cfi_rel_offset d14, 0x30
.cfi_rel_offset d15, 0x38
// Constants: v7.h[0] = 3329 (modulus), v7.h[1] = 20159 (used below with
// sqdmulh + srshr #11 + mls, i.e. the Barrett-reduction multiplier for 3329);
// v29 = 512 in every lane and v30 = 5040 in every lane (scaling constant and
// its companion high-multiply constant: 5040 = round(512 * 2^15 / 3329)).
mov w5, #0xd01 // =3329
mov v7.h[0], w5
mov w5, #0x4ebf // =20159
mov v7.h[1], w5
mov w5, #0x200 // =512
dup v29.8h, w5
mov w5, #0x13b0 // =5040
dup v30.8h, w5
// x3 = working pointer into the polynomial, x4 = 8 iterations of 64 bytes.
mov x3, x0
mov x4, #0x8 // =8
// Scaling pass: each iteration multiplies 32 coefficients (4 q-registers) by
// 512 modulo 3329 via sqrdmulh/mul/mls and stores them back, advancing x3 by
// 0x40. Presumably 512 is the inverse-NTT normalization factor in Montgomery
// form (2^16 / 128 mod 3329 = 512) - TODO confirm against the upstream source.
Lintt_scale_start:
ldr q8, [x3]
ldr q9, [x3, #0x10]
ldr q10, [x3, #0x20]
ldr q11, [x3, #0x30]
sqrdmulh v27.8h, v8.8h, v30.8h
mul v8.8h, v8.8h, v29.8h
mls v8.8h, v27.8h, v7.h[0]
sqrdmulh v27.8h, v9.8h, v30.8h
mul v9.8h, v9.8h, v29.8h
mls v9.8h, v27.8h, v7.h[0]
sqrdmulh v27.8h, v10.8h, v30.8h
mul v10.8h, v10.8h, v29.8h
mls v10.8h, v27.8h, v7.h[0]
sqrdmulh v27.8h, v11.8h, v30.8h
mul v11.8h, v11.8h, v29.8h
mls v11.8h, v27.8h, v7.h[0]
str q8, [x3], #0x40
stur q9, [x3, #-0x30]
stur q10, [x3, #-0x20]
stur q11, [x3, #-0x10]
// Loop control tests the register with cbnz; the flags set by subs are not
// consumed.
subs x4, x4, #0x1
cbnz x4, Lintt_scale_start
// "layer4567" phase setup: rewind x3 to the start of the polynomial and
// process it in eight 64-byte blocks. The code up to the loop label is the
// pipeline preamble (first block, plus loads for the second); v29/v30 are
// reused as scratch from here on.
mov x3, x0
mov x4, #0x8 // =8
ldr q3, [x3, #0x10]
ldr q20, [x3]
ldr q25, [x3, #0x20]
ldr q24, [x3, #0x30]
ldr q21, [x2, #0x50]
trn1 v18.4s, v25.4s, v24.4s
trn1 v6.4s, v20.4s, v3.4s
trn2 v12.4s, v25.4s, v24.4s
trn2 v31.4s, v20.4s, v3.4s
trn2 v28.2d, v6.2d, v18.2d
trn1 v25.2d, v6.2d, v18.2d
trn2 v15.2d, v31.2d, v12.2d
trn1 v20.2d, v31.2d, v12.2d
add v4.8h, v28.8h, v15.8h
add v1.8h, v25.8h, v20.8h
sub v30.8h, v28.8h, v15.8h
sub v3.8h, v25.8h, v20.8h
add v6.8h, v1.8h, v4.8h
sqrdmulh v9.8h, v30.8h, v21.8h
ldr q21, [x2, #0x40]
ldr q25, [x2, #0x30]
mul v21.8h, v30.8h, v21.8h
ldr q30, [x2, #0x20]
sub v28.8h, v1.8h, v4.8h
ldr q1, [x2, #0x10]
mls v21.8h, v9.8h, v7.h[0]
sqrdmulh v9.8h, v3.8h, v25.8h
mul v20.8h, v3.8h, v30.8h
ldr q29, [x2], #0x60
ldr q17, [x3, #0x60]
mls v20.8h, v9.8h, v7.h[0]
ldr q3, [x3, #0x70]
mul v4.8h, v28.8h, v29.8h
sub v25.8h, v20.8h, v21.8h
trn1 v15.4s, v17.4s, v3.4s
sqrdmulh v28.8h, v28.8h, v1.8h
trn2 v31.4s, v17.4s, v3.4s
mul v30.8h, v25.8h, v29.8h
add v20.8h, v20.8h, v21.8h
mls v4.8h, v28.8h, v7.h[0]
sqrdmulh v3.8h, v25.8h, v1.8h
ldr q28, [x3, #0x40]
trn1 v25.4s, v6.4s, v20.4s
mls v30.8h, v3.8h, v7.h[0]
ldr q27, [x3, #0x50]
trn2 v6.4s, v6.4s, v20.4s
trn1 v3.4s, v4.4s, v30.4s
trn2 v10.4s, v28.4s, v27.4s
trn2 v20.4s, v4.4s, v30.4s
trn2 v8.2d, v25.2d, v3.2d
trn1 v9.2d, v25.2d, v3.2d
trn1 v1.2d, v6.2d, v20.2d
trn2 v30.2d, v6.2d, v20.2d
add v4.8h, v9.8h, v1.8h
add v11.8h, v8.8h, v30.8h
trn2 v25.2d, v10.2d, v31.2d
sqdmulh v6.8h, v4.8h, v7.h[1]
sqdmulh v20.8h, v11.8h, v7.h[1]
ldr q21, [x2, #0x50]
srshr v0.8h, v6.8h, #0xb
srshr v3.8h, v20.8h, #0xb
trn1 v2.4s, v28.4s, v27.4s
mls v4.8h, v0.8h, v7.h[0]
mls v11.8h, v3.8h, v7.h[0]
ldr q0, [x1], #0x10
trn2 v20.2d, v2.2d, v15.2d
sub v6.8h, v4.8h, v11.8h
sub v5.8h, v20.8h, v25.8h
sub v22.8h, v9.8h, v1.8h
sqrdmulh v3.8h, v6.8h, v0.h[1]
mul v6.8h, v6.8h, v0.h[0]
sqrdmulh v12.8h, v5.8h, v21.8h
ldr q19, [x2, #0x40]
mls v6.8h, v3.8h, v7.h[0]
ldr q14, [x2], #0x60
// Two of the eight blocks are handled by the preamble and the tail, leaving
// six iterations for the loop body.
sub x4, x4, #0x2
// Steady-state loop: per iteration x3 advances by 0x40 (one 64-byte block),
// x2 by 0x60 and x1 by 0x10. The body finishes the stores for the current
// block while already loading and transposing the next one.
Lintt_layer4567_start:
str q6, [x3, #0x20]
ldur q18, [x2, #-0x50]
mul v26.8h, v5.8h, v19.8h
trn1 v16.2d, v10.2d, v31.2d
mul v27.8h, v22.8h, v0.h[2]
trn1 v10.2d, v2.2d, v15.2d
add v5.8h, v4.8h, v11.8h
mls v26.8h, v12.8h, v7.h[0]
add v11.8h, v10.8h, v16.8h
add v6.8h, v20.8h, v25.8h
ldur q25, [x2, #-0x40]
ldur q28, [x2, #-0x30]
ldr q2, [x3, #0xa0]
ldr q19, [x2, #0x40]
sub v17.8h, v8.8h, v30.8h
ldr q1, [x3, #0x90]
sqrdmulh v9.8h, v17.8h, v0.h[5]
str q5, [x3], #0x40
ldr q30, [x3, #0x70]
sub v10.8h, v10.8h, v16.8h
ldr q16, [x3, #0x40]
sqrdmulh v24.8h, v10.8h, v28.8h
mul v13.8h, v10.8h, v25.8h
sub v21.8h, v11.8h, v6.8h
trn1 v15.4s, v2.4s, v30.4s
trn2 v31.4s, v2.4s, v30.4s
mls v13.8h, v24.8h, v7.h[0]
mul v29.8h, v21.8h, v14.8h
ldr q12, [x2, #0x50]
sub v28.8h, v13.8h, v26.8h
trn2 v10.4s, v16.4s, v1.4s
add v30.8h, v11.8h, v6.8h
sqrdmulh v2.8h, v28.8h, v18.8h
mul v8.8h, v28.8h, v14.8h
sqrdmulh v18.8h, v21.8h, v18.8h
ldr q14, [x2], #0x60
mls v8.8h, v2.8h, v7.h[0]
add v11.8h, v13.8h, v26.8h
mls v29.8h, v18.8h, v7.h[0]
sqrdmulh v20.8h, v22.8h, v0.h[3]
trn1 v23.4s, v30.4s, v11.4s
trn2 v28.4s, v30.4s, v11.4s
trn2 v13.4s, v29.4s, v8.4s
trn1 v11.4s, v29.4s, v8.4s
mls v27.8h, v20.8h, v7.h[0]
trn1 v21.2d, v28.2d, v13.2d
trn2 v8.2d, v23.2d, v11.2d
trn1 v24.2d, v23.2d, v11.2d
mul v26.8h, v17.8h, v0.h[4]
trn2 v30.2d, v28.2d, v13.2d
add v4.8h, v24.8h, v21.8h
add v11.8h, v8.8h, v30.8h
mls v26.8h, v9.8h, v7.h[0]
sqdmulh v17.8h, v4.8h, v7.h[1]
sqdmulh v29.8h, v11.8h, v7.h[1]
trn2 v25.2d, v10.2d, v31.2d
add v2.8h, v27.8h, v26.8h
srshr v28.8h, v17.8h, #0xb
srshr v13.8h, v29.8h, #0xb
sqdmulh v20.8h, v2.8h, v7.h[1]
sub v5.8h, v27.8h, v26.8h
mls v4.8h, v28.8h, v7.h[0]
mls v11.8h, v13.8h, v7.h[0]
srshr v23.8h, v20.8h, #0xb
sqrdmulh v17.8h, v5.8h, v0.h[1]
mul v9.8h, v5.8h, v0.h[0]
mls v2.8h, v23.8h, v7.h[0]
sub v29.8h, v4.8h, v11.8h
ldr q0, [x1], #0x10
stur q2, [x3, #-0x30]
trn1 v2.4s, v16.4s, v1.4s
sqrdmulh v3.8h, v29.8h, v0.h[1]
mul v6.8h, v29.8h, v0.h[0]
trn2 v20.2d, v2.2d, v15.2d
mls v9.8h, v17.8h, v7.h[0]
sub v5.8h, v20.8h, v25.8h
mls v6.8h, v3.8h, v7.h[0]
sub v22.8h, v24.8h, v21.8h
stur q9, [x3, #-0x10]
sqrdmulh v12.8h, v5.8h, v12.8h
subs x4, x4, #0x1
cbnz x4, Lintt_layer4567_start
// Pipeline tail of the "layer4567" phase: completes the last two 64-byte
// blocks (two further post-incremented stores through x3) and consumes one
// more 16-byte entry from x1.
mul v21.8h, v22.8h, v0.h[2]
mul v28.8h, v5.8h, v19.8h
trn1 v10.2d, v10.2d, v31.2d
trn1 v2.2d, v2.2d, v15.2d
add v11.8h, v4.8h, v11.8h
sub v30.8h, v8.8h, v30.8h
add v23.8h, v20.8h, v25.8h
add v24.8h, v2.8h, v10.8h
mul v8.8h, v30.8h, v0.h[4]
sqrdmulh v5.8h, v30.8h, v0.h[5]
sqrdmulh v22.8h, v22.8h, v0.h[3]
add v30.8h, v24.8h, v23.8h
ldur q26, [x2, #-0x30]
mls v8.8h, v5.8h, v7.h[0]
sub v5.8h, v2.8h, v10.8h
ldur q13, [x2, #-0x40]
mls v21.8h, v22.8h, v7.h[0]
str q6, [x3, #0x20]
mul v3.8h, v5.8h, v13.8h
sqrdmulh v22.8h, v5.8h, v26.8h
sub v18.8h, v21.8h, v8.8h
mls v28.8h, v12.8h, v7.h[0]
str q11, [x3], #0x40
mls v3.8h, v22.8h, v7.h[0]
sqrdmulh v16.8h, v18.8h, v0.h[1]
sub v10.8h, v24.8h, v23.8h
mul v17.8h, v18.8h, v0.h[0]
sub v11.8h, v3.8h, v28.8h
mul v13.8h, v10.8h, v14.8h
add v22.8h, v3.8h, v28.8h
mul v14.8h, v11.8h, v14.8h
ldur q26, [x2, #-0x50]
trn2 v2.4s, v30.4s, v22.4s
mls v17.8h, v16.8h, v7.h[0]
sqrdmulh v10.8h, v10.8h, v26.8h
sqrdmulh v11.8h, v11.8h, v26.8h
ldr q9, [x1], #0x10
mls v13.8h, v10.8h, v7.h[0]
mls v14.8h, v11.8h, v7.h[0]
trn1 v6.4s, v30.4s, v22.4s
add v8.8h, v21.8h, v8.8h
stur q17, [x3, #-0x10]
trn2 v0.4s, v13.4s, v14.4s
trn1 v1.4s, v13.4s, v14.4s
sqdmulh v13.8h, v8.8h, v7.h[1]
trn1 v24.2d, v2.2d, v0.2d
trn2 v2.2d, v2.2d, v0.2d
trn2 v26.2d, v6.2d, v1.2d
trn1 v11.2d, v6.2d, v1.2d
add v22.8h, v26.8h, v2.8h
sub v28.8h, v11.8h, v24.8h
sub v27.8h, v26.8h, v2.8h
add v10.8h, v11.8h, v24.8h
sqrdmulh v11.8h, v28.8h, v9.h[3]
mul v24.8h, v28.8h, v9.h[2]
sqdmulh v1.8h, v22.8h, v7.h[1]
sqrdmulh v0.8h, v27.8h, v9.h[5]
srshr v12.8h, v13.8h, #0xb
mls v24.8h, v11.8h, v7.h[0]
sqdmulh v14.8h, v10.8h, v7.h[1]
mul v27.8h, v27.8h, v9.h[4]
mls v8.8h, v12.8h, v7.h[0]
srshr v5.8h, v1.8h, #0xb
srshr v14.8h, v14.8h, #0xb
mls v27.8h, v0.8h, v7.h[0]
mls v22.8h, v5.8h, v7.h[0]
mls v10.8h, v14.8h, v7.h[0]
stur q8, [x3, #-0x30]
sub v2.8h, v24.8h, v27.8h
add v14.8h, v24.8h, v27.8h
sub v11.8h, v10.8h, v22.8h
add v20.8h, v10.8h, v22.8h
sqdmulh v22.8h, v14.8h, v7.h[1]
sqrdmulh v8.8h, v11.8h, v9.h[1]
mul v27.8h, v11.8h, v9.h[0]
sqrdmulh v0.8h, v2.8h, v9.h[1]
mul v11.8h, v2.8h, v9.h[0]
srshr v10.8h, v22.8h, #0xb
mls v27.8h, v8.8h, v7.h[0]
str q20, [x3], #0x40
mls v11.8h, v0.8h, v7.h[0]
mls v14.8h, v10.8h, v7.h[0]
stur q27, [x3, #-0x20]
stur q11, [x3, #-0x10]
stur q14, [x3, #-0x30]
// "layer123" phase setup: the remaining 32 bytes of the x1 table are loaded
// once into v0/v1 and stay fixed. The polynomial is now addressed through x0
// as 8 rows at stride 0x40 (offsets 0x0 .. 0x1c0), processed one 16-byte
// column at a time; four columns in total. The code up to the loop label is
// the pipeline preamble for the first column.
mov x4, #0x4 // =4
ldr q0, [x1], #0x20
ldur q1, [x1, #-0x10]
ldr q2, [x0]
ldr q10, [x0, #0x40]
ldr q11, [x0, #0x80]
sub v14.8h, v2.8h, v10.8h
add v2.8h, v2.8h, v10.8h
ldr q10, [x0, #0xc0]
sqrdmulh v8.8h, v14.8h, v0.h[7]
mul v14.8h, v14.8h, v0.h[6]
sub v22.8h, v11.8h, v10.8h
add v10.8h, v11.8h, v10.8h
ldr q11, [x0, #0x1c0]
add v13.8h, v2.8h, v10.8h
sub v2.8h, v2.8h, v10.8h
sqrdmulh v10.8h, v22.8h, v1.h[1]
mul v22.8h, v22.8h, v1.h[0]
mls v14.8h, v8.8h, v7.h[0]
sqrdmulh v8.8h, v2.8h, v0.h[3]
mul v2.8h, v2.8h, v0.h[2]
mls v22.8h, v10.8h, v7.h[0]
ldr q10, [x0, #0x100]
mls v2.8h, v8.8h, v7.h[0]
sub v8.8h, v14.8h, v22.8h
add v14.8h, v14.8h, v22.8h
ldr q22, [x0, #0x180]
sqrdmulh v24.8h, v8.8h, v0.h[3]
mul v8.8h, v8.8h, v0.h[2]
sub v26.8h, v22.8h, v11.8h
add v11.8h, v22.8h, v11.8h
ldr q22, [x0, #0x140]
sqrdmulh v16.8h, v26.8h, v1.h[5]
mul v26.8h, v26.8h, v1.h[4]
add v23.8h, v10.8h, v22.8h
sub v10.8h, v10.8h, v22.8h
mls v8.8h, v24.8h, v7.h[0]
add v22.8h, v23.8h, v11.8h
mul v24.8h, v10.8h, v1.h[2]
sqrdmulh v10.8h, v10.8h, v1.h[3]
sub v19.8h, v13.8h, v22.8h
add v18.8h, v13.8h, v22.8h
sub v11.8h, v23.8h, v11.8h
mls v24.8h, v10.8h, v7.h[0]
mls v26.8h, v16.8h, v7.h[0]
sqrdmulh v10.8h, v11.8h, v0.h[5]
mul v11.8h, v11.8h, v0.h[4]
sqrdmulh v22.8h, v19.8h, v0.h[1]
sub v13.8h, v24.8h, v26.8h
mul v16.8h, v19.8h, v0.h[0]
mls v11.8h, v10.8h, v7.h[0]
sqrdmulh v10.8h, v13.8h, v0.h[5]
mul v13.8h, v13.8h, v0.h[4]
add v24.8h, v24.8h, v26.8h
sub v26.8h, v2.8h, v11.8h
add v9.8h, v2.8h, v11.8h
add v11.8h, v14.8h, v24.8h
sub v14.8h, v14.8h, v24.8h
sqrdmulh v2.8h, v26.8h, v0.h[1]
mul v24.8h, v26.8h, v0.h[0]
mls v13.8h, v10.8h, v7.h[0]
mls v16.8h, v22.8h, v7.h[0]
sqrdmulh v10.8h, v14.8h, v0.h[1]
mls v24.8h, v2.8h, v7.h[0]
add v22.8h, v8.8h, v13.8h
str q16, [x0, #0x100]
sub v2.8h, v8.8h, v13.8h
str q24, [x0, #0x180]
mul v13.8h, v14.8h, v0.h[0]
str q22, [x0, #0xc0]
sqrdmulh v21.8h, v2.8h, v0.h[1]
ldr q6, [x0, #0x90]
ldr q14, [x0, #0xd0]
mls v13.8h, v10.8h, v7.h[0]
str q11, [x0, #0x40]
sub v10.8h, v6.8h, v14.8h
ldr q11, [x0, #0x10]
sqrdmulh v19.8h, v10.8h, v1.h[1]
mul v20.8h, v10.8h, v1.h[0]
ldr q28, [x0, #0x50]
// Two of the four columns are handled by the preamble and the tail, leaving
// two iterations for the loop body.
sub x4, x4, #0x2
// Steady-state loop: per iteration x0 advances by 0x10 (one column); the
// remaining stores of the previous column are interleaved with the loads and
// butterflies of the current one.
Lintt_layer123_start:
mls v20.8h, v19.8h, v7.h[0]
ldr q31, [x0, #0x1d0]
sub v22.8h, v11.8h, v28.8h
ldr q30, [x0, #0x110]
sqrdmulh v8.8h, v22.8h, v0.h[7]
mul v3.8h, v22.8h, v0.h[6]
mul v5.8h, v2.8h, v0.h[0]
str q13, [x0, #0x140]
add v10.8h, v11.8h, v28.8h
ldr q22, [x0, #0x150]
ldr q4, [x0, #0x190]
sub v23.8h, v30.8h, v22.8h
add v27.8h, v30.8h, v22.8h
mls v3.8h, v8.8h, v7.h[0]
mls v5.8h, v21.8h, v7.h[0]
ldr q11, [x0, #0x20]
sub v17.8h, v4.8h, v31.8h
add v2.8h, v6.8h, v14.8h
mul v19.8h, v23.8h, v1.h[2]
sub v22.8h, v3.8h, v20.8h
add v14.8h, v10.8h, v2.8h
sub v24.8h, v10.8h, v2.8h
sqrdmulh v2.8h, v23.8h, v1.h[3]
sqrdmulh v30.8h, v22.8h, v0.h[3]
mul v23.8h, v22.8h, v0.h[2]
sqrdmulh v15.8h, v17.8h, v1.h[5]
mls v19.8h, v2.8h, v7.h[0]
add v2.8h, v4.8h, v31.8h
mul v21.8h, v17.8h, v1.h[4]
sqrdmulh v22.8h, v24.8h, v0.h[3]
sub v26.8h, v27.8h, v2.8h
add v8.8h, v27.8h, v2.8h
mul v28.8h, v24.8h, v0.h[2]
sqrdmulh v10.8h, v26.8h, v0.h[5]
mul v31.8h, v26.8h, v0.h[4]
mls v21.8h, v15.8h, v7.h[0]
mls v28.8h, v22.8h, v7.h[0]
sub v17.8h, v14.8h, v8.8h
mls v31.8h, v10.8h, v7.h[0]
sub v27.8h, v19.8h, v21.8h
sqrdmulh v29.8h, v17.8h, v0.h[1]
mul v10.8h, v17.8h, v0.h[0]
sub v15.8h, v28.8h, v31.8h
sqrdmulh v17.8h, v27.8h, v0.h[5]
mul v25.8h, v27.8h, v0.h[4]
sqrdmulh v6.8h, v15.8h, v0.h[1]
mul v27.8h, v15.8h, v0.h[0]
add v16.8h, v19.8h, v21.8h
mls v25.8h, v17.8h, v7.h[0]
mls v23.8h, v30.8h, v7.h[0]
mls v27.8h, v6.8h, v7.h[0]
ldr q6, [x0, #0xa0]
add v22.8h, v23.8h, v25.8h
str q27, [x0, #0x190]
add v4.8h, v3.8h, v20.8h
str q22, [x0, #0xd0]
mls v10.8h, v29.8h, v7.h[0]
str q5, [x0, #0x1c0]
add v20.8h, v4.8h, v16.8h
str q18, [x0], #0x10
sub v18.8h, v4.8h, v16.8h
str q10, [x0, #0x100]
sub v2.8h, v23.8h, v25.8h
sqrdmulh v12.8h, v18.8h, v0.h[1]
mul v13.8h, v18.8h, v0.h[0]
add v18.8h, v14.8h, v8.8h
ldr q14, [x0, #0xd0]
mls v13.8h, v12.8h, v7.h[0]
str q9, [x0, #0x70]
sub v3.8h, v6.8h, v14.8h
add v9.8h, v28.8h, v31.8h
str q20, [x0, #0x40]
sqrdmulh v19.8h, v3.8h, v1.h[1]
mul v20.8h, v3.8h, v1.h[0]
sqrdmulh v21.8h, v2.8h, v0.h[1]
ldr q28, [x0, #0x50]
subs x4, x4, #0x1
cbnz x4, Lintt_layer123_start
// Pipeline tail of the "layer123" phase: finishes the stores of the third
// column and processes the fourth. x0 is advanced twice more by 0x10 and is
// not restored to its entry value.
mls v20.8h, v19.8h, v7.h[0]
sub v10.8h, v11.8h, v28.8h
add v11.8h, v11.8h, v28.8h
mul v2.8h, v2.8h, v0.h[0]
str q13, [x0, #0x140]
add v25.8h, v6.8h, v14.8h
str q18, [x0], #0x10
sqrdmulh v17.8h, v10.8h, v0.h[7]
str q9, [x0, #0x70]
ldr q8, [x0, #0x1c0]
ldr q13, [x0, #0x100]
ldr q26, [x0, #0x180]
ldr q24, [x0, #0x140]
add v15.8h, v26.8h, v8.8h
sub v8.8h, v26.8h, v8.8h
sub v12.8h, v13.8h, v24.8h
add v24.8h, v13.8h, v24.8h
sqrdmulh v18.8h, v8.8h, v1.h[5]
mul v26.8h, v12.8h, v1.h[2]
mul v8.8h, v8.8h, v1.h[4]
sqrdmulh v16.8h, v12.8h, v1.h[3]
mul v10.8h, v10.8h, v0.h[6]
add v22.8h, v11.8h, v25.8h
mls v8.8h, v18.8h, v7.h[0]
mls v26.8h, v16.8h, v7.h[0]
mls v10.8h, v17.8h, v7.h[0]
add v23.8h, v24.8h, v15.8h
sub v11.8h, v11.8h, v25.8h
sub v3.8h, v26.8h, v8.8h
sub v14.8h, v10.8h, v20.8h
sub v19.8h, v22.8h, v23.8h
mul v18.8h, v3.8h, v0.h[4]
sqrdmulh v17.8h, v14.8h, v0.h[3]
mul v14.8h, v14.8h, v0.h[2]
sqrdmulh v3.8h, v3.8h, v0.h[5]
sub v16.8h, v24.8h, v15.8h
mls v2.8h, v21.8h, v7.h[0]
mls v14.8h, v17.8h, v7.h[0]
mls v18.8h, v3.8h, v7.h[0]
sqrdmulh v31.8h, v16.8h, v0.h[5]
str q2, [x0, #0x1b0]
mul v13.8h, v16.8h, v0.h[4]
add v24.8h, v14.8h, v18.8h
sqrdmulh v2.8h, v11.8h, v0.h[3]
mul v21.8h, v11.8h, v0.h[2]
mls v13.8h, v31.8h, v7.h[0]
add v16.8h, v26.8h, v8.8h
add v28.8h, v10.8h, v20.8h
mls v21.8h, v2.8h, v7.h[0]
sub v14.8h, v14.8h, v18.8h
add v2.8h, v28.8h, v16.8h
sub v10.8h, v28.8h, v16.8h
sub v16.8h, v21.8h, v13.8h
sqrdmulh v27.8h, v19.8h, v0.h[1]
mul v26.8h, v19.8h, v0.h[0]
sqrdmulh v19.8h, v16.8h, v0.h[1]
mul v28.8h, v16.8h, v0.h[0]
sqrdmulh v8.8h, v14.8h, v0.h[1]
mls v26.8h, v27.8h, v7.h[0]
mul v14.8h, v14.8h, v0.h[0]
mls v28.8h, v19.8h, v7.h[0]
sqrdmulh v20.8h, v10.8h, v0.h[1]
str q26, [x0, #0x100]
mul v10.8h, v10.8h, v0.h[0]
str q28, [x0, #0x180]
add v22.8h, v22.8h, v23.8h
str q24, [x0, #0xc0]
mls v10.8h, v20.8h, v7.h[0]
str q2, [x0, #0x40]
mls v14.8h, v8.8h, v7.h[0]
str q22, [x0], #0x10
add v11.8h, v21.8h, v13.8h
str q10, [x0, #0x130]
str q11, [x0, #0x70]
str q14, [x0, #0x1b0]
// Epilogue: restore d8-d15, release the 64-byte stack frame and return.
ldp d8, d9, [sp]
.cfi_restore d8
.cfi_restore d9
ldp d10, d11, [sp, #0x10]
.cfi_restore d10
.cfi_restore d11
ldp d12, d13, [sp, #0x20]
.cfi_restore d12
.cfi_restore d13
ldp d14, d15, [sp, #0x30]
.cfi_restore d14
.cfi_restore d15
add sp, sp, #0x40
.cfi_adjust_cfa_offset -0x40
ret
.cfi_endproc

View File

@@ -0,0 +1,409 @@
/* Copyright (c) 2022 Arm Limited
* Copyright (c) 2022 Hanno Becker
* Copyright (c) 2023 Amin Abdulrahman, Matthias Kannwischer
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [NeonNTT]
* Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
* Becker, Hwang, Kannwischer, Yang, Yang
* https://eprint.iacr.org/2021/986
*
* - [SLOTHY_Paper]
* Fast and Clean: Auditable high-performance assembly via constraint solving
* Abdulrahman, Becker, Kannwischer, Klein
* https://eprint.iacr.org/2022/1303
*/
/*yaml
Name: ntt_asm
Description: AArch64 ML-KEM forward NTT following @[NeonNTT] and @[SLOTHY_Paper]
Signature: void mlk_ntt_asm(int16_t p[256], const int16_t twiddles12345[80], const int16_t twiddles56[384])
ABI:
x0:
type: buffer
size_bytes: 512
permissions: read/write
c_parameter: int16_t p[256]
description: Input/output polynomial
x1:
type: buffer
size_bytes: 160
permissions: read-only
c_parameter: const int16_t twiddles12345[80]
description: Twiddle factors for layers 1-5
x2:
type: buffer
size_bytes: 768
permissions: read-only
c_parameter: const int16_t twiddles56[384]
description: Twiddle factors for layers 6-7
Stack:
bytes: 64
description: saving callee-saved Neon registers
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/ntt.S using scripts/simpasm. Do not modify it directly.
*/
// mlkem_ntt_asm: in-place forward NTT of the 256-coefficient polynomial at x0.
//   x0 = polynomial (512 bytes, read/write)
//   x1 = twiddle table for the "layer123"/per-block constants (160 bytes read)
//   x2 = twiddle table for the "layer4567" phase (768 bytes read)
// Structure: prologue, constant setup, a software-pipelined "layer123" pass
// (16-byte columns over 8 rows at stride 0x40), a software-pipelined
// "layer4567" pass (64-byte blocks), epilogue.
// The instruction order is the output of an optimizing scheduler; loads for
// the next iteration are interleaved with arithmetic for the current one, so
// statements must not be reordered by hand.
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_ntt_asm)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_ntt_asm)
S2N_BN_SYMBOL(mlkem_ntt_asm):
.cfi_startproc
// Prologue: reserve 64 bytes of stack and save d8-d15 (callee-saved SIMD
// registers), with matching CFI unwind annotations.
sub sp, sp, #0x40
.cfi_adjust_cfa_offset 0x40
stp d8, d9, [sp]
.cfi_rel_offset d8, 0x0
.cfi_rel_offset d9, 0x8
stp d10, d11, [sp, #0x10]
.cfi_rel_offset d10, 0x10
.cfi_rel_offset d11, 0x18
stp d12, d13, [sp, #0x20]
.cfi_rel_offset d12, 0x20
.cfi_rel_offset d13, 0x28
stp d14, d15, [sp, #0x30]
.cfi_rel_offset d14, 0x30
.cfi_rel_offset d15, 0x38
// Constants: v7.h[0] = 3329 (modulus, the mls operand throughout) and
// v7.h[1] = 20159.
// NOTE(review): no later instruction in this routine appears to read
// v7.h[1]; it looks like a leftover of the shared constant setup - confirm
// upstream before touching (the machine code is covered by a formal proof).
mov w5, #0xd01 // =3329
mov v7.h[0], w5
mov w5, #0x4ebf // =20159
mov v7.h[1], w5
// x3 keeps the original polynomial pointer so x0 can be rewound between the
// two phases; x4 = 4 columns for the "layer123" phase.
mov x3, x0
mov x4, #0x4 // =4
// "layer123" preamble: the first 32 bytes of the x1 table are loaded once
// into v0/v1 and stay fixed for the phase; the 8 rows of the first column
// (offsets 0x0 .. 0x1c0, stride 0x40) are loaded and the first products
// started.
ldr q0, [x1], #0x20
ldur q1, [x1, #-0x10]
ldr q5, [x0]
ldr q13, [x0, #0x40]
ldr q3, [x0, #0x80]
ldr q22, [x0, #0xc0]
ldr q24, [x0, #0x100]
ldr q11, [x0, #0x1c0]
mul v23.8h, v24.8h, v0.h[0]
ldr q2, [x0, #0x140]
mul v17.8h, v11.8h, v0.h[0]
ldr q19, [x0, #0x180]
// One of the four columns is completed by the tail, leaving three loop
// iterations.
sub x4, x4, #0x1
// Steady-state loop: each iteration finishes one 16-byte column (8 rows),
// advances x0 by 0x10 and loads the rows of the next column. Every
// sqrdmulh/mul/mls triple is a multiplication by a twiddle lane modulo 3329.
Lntt_layer123_start:
sqrdmulh v8.8h, v24.8h, v0.h[1]
sqrdmulh v24.8h, v2.8h, v0.h[1]
mul v2.8h, v2.8h, v0.h[0]
sqrdmulh v14.8h, v19.8h, v0.h[1]
mls v23.8h, v8.8h, v7.h[0]
mul v8.8h, v19.8h, v0.h[0]
mls v2.8h, v24.8h, v7.h[0]
sqrdmulh v24.8h, v11.8h, v0.h[1]
sub v11.8h, v5.8h, v23.8h
mls v8.8h, v14.8h, v7.h[0]
sub v14.8h, v13.8h, v2.8h
add v2.8h, v13.8h, v2.8h
add v23.8h, v5.8h, v23.8h
sub v19.8h, v3.8h, v8.8h
add v8.8h, v3.8h, v8.8h
mls v17.8h, v24.8h, v7.h[0]
sqrdmulh v24.8h, v19.8h, v0.h[5]
mul v19.8h, v19.8h, v0.h[4]
sqrdmulh v5.8h, v8.8h, v0.h[3]
sub v13.8h, v22.8h, v17.8h
add v17.8h, v22.8h, v17.8h
mls v19.8h, v24.8h, v7.h[0]
sqrdmulh v24.8h, v13.8h, v0.h[5]
mul v13.8h, v13.8h, v0.h[4]
mul v8.8h, v8.8h, v0.h[2]
sub v3.8h, v11.8h, v19.8h
add v11.8h, v11.8h, v19.8h
mls v13.8h, v24.8h, v7.h[0]
sqrdmulh v24.8h, v17.8h, v0.h[3]
mul v19.8h, v17.8h, v0.h[2]
mls v8.8h, v5.8h, v7.h[0]
sub v17.8h, v14.8h, v13.8h
add v14.8h, v14.8h, v13.8h
mls v19.8h, v24.8h, v7.h[0]
sub v24.8h, v23.8h, v8.8h
add v8.8h, v23.8h, v8.8h
sqrdmulh v23.8h, v14.8h, v1.h[3]
sub v5.8h, v2.8h, v19.8h
add v2.8h, v2.8h, v19.8h
mul v14.8h, v14.8h, v1.h[2]
sqrdmulh v19.8h, v5.8h, v1.h[1]
sqrdmulh v13.8h, v2.8h, v0.h[7]
mul v2.8h, v2.8h, v0.h[6]
mul v5.8h, v5.8h, v1.h[0]
mls v14.8h, v23.8h, v7.h[0]
sqrdmulh v23.8h, v17.8h, v1.h[5]
mls v2.8h, v13.8h, v7.h[0]
mls v5.8h, v19.8h, v7.h[0]
sub v19.8h, v11.8h, v14.8h
add v14.8h, v11.8h, v14.8h
sub v11.8h, v8.8h, v2.8h
mul v17.8h, v17.8h, v1.h[4]
add v8.8h, v8.8h, v2.8h
sub v2.8h, v24.8h, v5.8h
add v24.8h, v24.8h, v5.8h
mls v17.8h, v23.8h, v7.h[0]
str q8, [x0], #0x10
ldr q5, [x0]
sub v8.8h, v3.8h, v17.8h
add v23.8h, v3.8h, v17.8h
str q11, [x0, #0x30]
ldr q13, [x0, #0x40]
str q24, [x0, #0x70]
ldr q3, [x0, #0x80]
str q2, [x0, #0xb0]
ldr q22, [x0, #0xc0]
str q14, [x0, #0xf0]
ldr q24, [x0, #0x100]
str q19, [x0, #0x130]
ldr q2, [x0, #0x140]
str q23, [x0, #0x170]
mul v23.8h, v24.8h, v0.h[0]
str q8, [x0, #0x1b0]
ldr q11, [x0, #0x1c0]
ldr q19, [x0, #0x180]
mul v17.8h, v11.8h, v0.h[0]
// Loop control tests the register with cbnz; the flags set by subs are not
// consumed.
subs x4, x4, #0x1
cbnz x4, Lntt_layer123_start
// Pipeline tail of the "layer123" phase: processes and stores the fourth
// column.
sqrdmulh v6.8h, v11.8h, v0.h[1]
mul v25.8h, v19.8h, v0.h[0]
sqrdmulh v12.8h, v19.8h, v0.h[1]
mul v11.8h, v2.8h, v0.h[0]
mls v17.8h, v6.8h, v7.h[0]
sqrdmulh v14.8h, v2.8h, v0.h[1]
mls v25.8h, v12.8h, v7.h[0]
sqrdmulh v27.8h, v24.8h, v0.h[1]
add v9.8h, v22.8h, v17.8h
mls v11.8h, v14.8h, v7.h[0]
sub v26.8h, v3.8h, v25.8h
sqrdmulh v2.8h, v9.8h, v0.h[3]
mul v24.8h, v9.8h, v0.h[2]
mul v19.8h, v26.8h, v0.h[4]
sqrdmulh v14.8h, v26.8h, v0.h[5]
mls v23.8h, v27.8h, v7.h[0]
mls v24.8h, v2.8h, v7.h[0]
add v6.8h, v13.8h, v11.8h
mls v19.8h, v14.8h, v7.h[0]
sub v4.8h, v5.8h, v23.8h
add v10.8h, v3.8h, v25.8h
sub v8.8h, v6.8h, v24.8h
add v3.8h, v4.8h, v19.8h
sub v31.8h, v4.8h, v19.8h
mul v14.8h, v8.8h, v1.h[0]
sqrdmulh v4.8h, v10.8h, v0.h[3]
mul v12.8h, v10.8h, v0.h[2]
sqrdmulh v2.8h, v8.8h, v1.h[1]
sub v8.8h, v22.8h, v17.8h
add v30.8h, v5.8h, v23.8h
mls v12.8h, v4.8h, v7.h[0]
sqrdmulh v4.8h, v8.8h, v0.h[5]
mul v19.8h, v8.8h, v0.h[4]
mls v14.8h, v2.8h, v7.h[0]
sub v27.8h, v30.8h, v12.8h
sub v23.8h, v13.8h, v11.8h
mls v19.8h, v4.8h, v7.h[0]
sub v2.8h, v27.8h, v14.8h
add v8.8h, v27.8h, v14.8h
add v14.8h, v6.8h, v24.8h
str q2, [x0, #0xc0]
add v2.8h, v23.8h, v19.8h
str q8, [x0, #0x80]
sub v19.8h, v23.8h, v19.8h
sqrdmulh v13.8h, v2.8h, v1.h[3]
mul v17.8h, v2.8h, v1.h[2]
add v27.8h, v30.8h, v12.8h
sqrdmulh v24.8h, v19.8h, v1.h[5]
mul v19.8h, v19.8h, v1.h[4]
mls v17.8h, v13.8h, v7.h[0]
sqrdmulh v8.8h, v14.8h, v0.h[7]
mul v2.8h, v14.8h, v0.h[6]
mls v19.8h, v24.8h, v7.h[0]
add v26.8h, v3.8h, v17.8h
sub v14.8h, v3.8h, v17.8h
mls v2.8h, v8.8h, v7.h[0]
str q26, [x0, #0x100]
add v8.8h, v31.8h, v19.8h
str q14, [x0, #0x140]
sub v24.8h, v31.8h, v19.8h
str q8, [x0, #0x180]
add v18.8h, v27.8h, v2.8h
str q24, [x0, #0x1c0]
sub v14.8h, v27.8h, v2.8h
str q18, [x0], #0x10
str q14, [x0, #0x30]
// "layer4567" phase setup: rewind x0 to the start of the polynomial and
// process it in eight 64-byte blocks. Each block consumes 16 bytes from x1
// and 0x60 bytes from x2. The code up to the loop label is the pipeline
// preamble for the first block.
mov x0, x3
mov x4, #0x8 // =8
ldr q11, [x1], #0x10
ldr q24, [x0, #0x30]
ldr q8, [x0, #0x20]
sqrdmulh v14.8h, v24.8h, v11.h[1]
mul v2.8h, v24.8h, v11.h[0]
sqrdmulh v9.8h, v8.8h, v11.h[1]
ldr q24, [x0, #0x10]
mls v2.8h, v14.8h, v7.h[0]
mul v14.8h, v8.8h, v11.h[0]
ldr q6, [x2, #0x40]
sub v8.8h, v24.8h, v2.8h
mls v14.8h, v9.8h, v7.h[0]
add v2.8h, v24.8h, v2.8h
mul v27.8h, v8.8h, v11.h[4]
sqrdmulh v8.8h, v8.8h, v11.h[5]
mul v24.8h, v2.8h, v11.h[2]
sqrdmulh v11.8h, v2.8h, v11.h[3]
mls v27.8h, v8.8h, v7.h[0]
ldr q5, [x2, #0x50]
// One of the eight blocks is completed by the tail, leaving seven loop
// iterations.
sub x4, x4, #0x1
// Steady-state loop: finishes the current 64-byte block (the trn1/trn2 pairs
// transpose 32-bit and 64-bit lanes between the butterfly stages and back
// before the stores), advances x0 by 0x40, and starts the first products of
// the next block.
Lntt_layer4567_start:
ldr q8, [x0]
ldr q17, [x2, #0x10]
sub v1.8h, v8.8h, v14.8h
mls v24.8h, v11.8h, v7.h[0]
add v8.8h, v8.8h, v14.8h
sub v0.8h, v1.8h, v27.8h
add v12.8h, v1.8h, v27.8h
sub v19.8h, v8.8h, v24.8h
add v8.8h, v8.8h, v24.8h
trn1 v24.4s, v12.4s, v0.4s
trn2 v13.4s, v12.4s, v0.4s
trn1 v23.4s, v8.4s, v19.4s
ldr q2, [x2], #0x60
trn2 v9.2d, v23.2d, v24.2d
trn2 v8.4s, v8.4s, v19.4s
sqrdmulh v26.8h, v9.8h, v17.8h
trn1 v24.2d, v23.2d, v24.2d
trn2 v11.2d, v8.2d, v13.2d
trn1 v29.2d, v8.2d, v13.2d
sqrdmulh v23.8h, v11.8h, v17.8h
mul v10.8h, v11.8h, v2.8h
mul v0.8h, v9.8h, v2.8h
ldur q11, [x2, #-0x40]
mls v10.8h, v23.8h, v7.h[0]
mls v0.8h, v26.8h, v7.h[0]
ldur q19, [x2, #-0x30]
add v17.8h, v29.8h, v10.8h
sub v23.8h, v24.8h, v0.8h
sub v30.8h, v29.8h, v10.8h
mul v2.8h, v17.8h, v11.8h
sqrdmulh v11.8h, v17.8h, v19.8h
mul v8.8h, v30.8h, v6.8h
ldr q22, [x0, #0x70]
mls v2.8h, v11.8h, v7.h[0]
add v24.8h, v24.8h, v0.8h
ldr q15, [x1], #0x10
sub v14.8h, v24.8h, v2.8h
add v24.8h, v24.8h, v2.8h
sqrdmulh v1.8h, v22.8h, v15.h[1]
mul v2.8h, v22.8h, v15.h[0]
trn1 v0.4s, v24.4s, v14.4s
trn2 v24.4s, v24.4s, v14.4s
sqrdmulh v19.8h, v30.8h, v5.8h
mls v2.8h, v1.8h, v7.h[0]
ldr q16, [x0, #0x60]
mls v8.8h, v19.8h, v7.h[0]
ldr q6, [x2, #0x40]
mul v14.8h, v16.8h, v15.h[0]
sub v3.8h, v23.8h, v8.8h
add v8.8h, v23.8h, v8.8h
ldr q5, [x2, #0x50]
trn2 v23.4s, v8.4s, v3.4s
trn1 v31.4s, v8.4s, v3.4s
sqrdmulh v8.8h, v16.8h, v15.h[1]
trn2 v25.2d, v24.2d, v23.2d
trn1 v29.2d, v24.2d, v23.2d
ldr q24, [x0, #0x50]
trn1 v16.2d, v0.2d, v31.2d
mls v14.8h, v8.8h, v7.h[0]
sub v13.8h, v24.8h, v2.8h
add v24.8h, v24.8h, v2.8h
trn2 v2.2d, v0.2d, v31.2d
sqrdmulh v19.8h, v13.8h, v15.h[5]
str q2, [x0, #0x20]
sqrdmulh v11.8h, v24.8h, v15.h[3]
str q16, [x0], #0x40
mul v27.8h, v13.8h, v15.h[4]
stur q29, [x0, #-0x30]
mul v24.8h, v24.8h, v15.h[2]
stur q25, [x0, #-0x10]
mls v27.8h, v19.8h, v7.h[0]
subs x4, x4, #0x1
cbnz x4, Lntt_layer4567_start
// Pipeline tail of the "layer4567" phase: completes and stores the eighth
// 64-byte block, consuming the last 0x60 bytes of the x2 table.
ldr q23, [x0]
ldr q17, [x2], #0x60
sub v19.8h, v23.8h, v14.8h
mls v24.8h, v11.8h, v7.h[0]
add v14.8h, v23.8h, v14.8h
add v8.8h, v19.8h, v27.8h
sub v13.8h, v19.8h, v27.8h
add v12.8h, v14.8h, v24.8h
sub v24.8h, v14.8h, v24.8h
trn1 v0.4s, v8.4s, v13.4s
trn2 v23.4s, v8.4s, v13.4s
trn2 v19.4s, v12.4s, v24.4s
ldur q27, [x2, #-0x50]
trn2 v8.2d, v19.2d, v23.2d
trn1 v22.4s, v12.4s, v24.4s
mul v14.8h, v8.8h, v17.8h
sqrdmulh v24.8h, v8.8h, v27.8h
trn2 v2.2d, v22.2d, v0.2d
trn1 v8.2d, v19.2d, v23.2d
mul v11.8h, v2.8h, v17.8h
mls v14.8h, v24.8h, v7.h[0]
ldur q26, [x2, #-0x30]
sqrdmulh v23.8h, v2.8h, v27.8h
sub v24.8h, v8.8h, v14.8h
ldur q2, [x2, #-0x40]
sqrdmulh v19.8h, v24.8h, v5.8h
add v14.8h, v8.8h, v14.8h
mul v24.8h, v24.8h, v6.8h
mls v11.8h, v23.8h, v7.h[0]
sqrdmulh v8.8h, v14.8h, v26.8h
mul v2.8h, v14.8h, v2.8h
trn1 v14.2d, v22.2d, v0.2d
mls v24.8h, v19.8h, v7.h[0]
sub v23.8h, v14.8h, v11.8h
mls v2.8h, v8.8h, v7.h[0]
add v14.8h, v14.8h, v11.8h
add v8.8h, v23.8h, v24.8h
sub v24.8h, v23.8h, v24.8h
sub v19.8h, v14.8h, v2.8h
add v11.8h, v14.8h, v2.8h
trn1 v2.4s, v8.4s, v24.4s
trn2 v14.4s, v8.4s, v24.4s
trn2 v23.4s, v11.4s, v19.4s
trn1 v11.4s, v11.4s, v19.4s
trn2 v8.2d, v23.2d, v14.2d
trn1 v24.2d, v11.2d, v2.2d
str q8, [x0, #0x30]
trn2 v8.2d, v11.2d, v2.2d
str q24, [x0], #0x40
trn1 v24.2d, v23.2d, v14.2d
stur q8, [x0, #-0x20]
stur q24, [x0, #-0x30]
// Epilogue: restore d8-d15, release the 64-byte stack frame and return.
ldp d8, d9, [sp]
.cfi_restore d8
.cfi_restore d9
ldp d10, d11, [sp, #0x10]
.cfi_restore d10
.cfi_restore d11
ldp d12, d13, [sp, #0x20]
.cfi_restore d12
.cfi_restore d13
ldp d14, d15, [sp, #0x30]
.cfi_restore d14
.cfi_restore d15
add sp, sp, #0x40
.cfi_adjust_cfa_offset -0x40
ret
.cfi_endproc

View File

@@ -0,0 +1,84 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*yaml
Name: poly_mulcache_compute_asm
Description: Compute multiplication cache for polynomial
Signature: void mlk_poly_mulcache_compute_asm(int16_t cache[128], const int16_t mlk_poly[256], const int16_t zetas[128], const int16_t zetas_twisted[128])
ABI:
x0:
type: buffer
size_bytes: 256
permissions: write-only
c_parameter: int16_t cache[128]
description: Output cache
x1:
type: buffer
size_bytes: 512
permissions: read-only
c_parameter: const int16_t mlk_poly[256]
description: Input polynomial
x2:
type: buffer
size_bytes: 256
permissions: read-only
c_parameter: const int16_t zetas[128]
description: Zeta values
x3:
type: buffer
size_bytes: 256
permissions: read-only
c_parameter: const int16_t zetas_twisted[128]
description: Twisted zeta values
Stack:
bytes: 0
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_mulcache_compute_asm.S using scripts/simpasm. Do not modify it directly.
*/
// mlkem_poly_mulcache_compute_asm: fills the 128-entry cache at x0 from the
// 256-coefficient polynomial at x1.
//   x0 = cache output (256 bytes written, 16 bytes per step)
//   x1 = input polynomial (512 bytes read, 32 bytes per step)
//   x2 = zeta table (256 bytes read, 16 bytes per step)
//   x3 = twisted zeta table (256 bytes read, 16 bytes per step)
// Per step: uzp2 gathers the odd-indexed 16-bit coefficients of a 32-byte
// input chunk, and mul / sqrdmulh / mls multiplies them lane-wise by the
// matching zeta entries modulo 3329. The routine is a leaf: it uses no stack
// and only caller-saved vector registers.
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_mulcache_compute_asm)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_mulcache_compute_asm)
S2N_BN_SYMBOL(mlkem_poly_mulcache_compute_asm):
.cfi_startproc
// Constants: v6 = 3329 in every lane (only lane 0 is referenced, as the mls
// operand).
mov w5, #0xd01 // =3329
dup v6.8h, w5
// NOTE(review): this 20159 broadcast into v7 looks dead - v7 is overwritten
// by "ldr q7, [x3], #0x10" in the loop before any instruction reads it.
// Left untouched because the machine code is covered by a formal proof.
mov w5, #0x4ebf // =20159
dup v7.8h, w5
// x4 = 16 steps in total. The preamble below loads the first chunk and
// starts its product; the final mls/str for each chunk is deferred to the
// next loop iteration (or to the tail after the loop).
mov x4, #0x10 // =16
ldr q1, [x1, #0x10]
ldr q27, [x1], #0x20
ldr q23, [x2], #0x10
uzp2 v27.8h, v27.8h, v1.8h
ldr q1, [x3], #0x10
mul v2.8h, v27.8h, v23.8h
sqrdmulh v27.8h, v27.8h, v1.8h
// 15 loop iterations; the 16th result is completed by the tail.
sub x4, x4, #0x1
Lpoly_mulcache_compute_loop:
ldr q29, [x1, #0x10]
ldr q21, [x2], #0x10
// Finish the previous chunk: v2 = low product - high product * 3329.
mls v2.8h, v27.8h, v6.h[0]
ldr q27, [x1], #0x20
ldr q7, [x3], #0x10
uzp2 v28.8h, v27.8h, v29.8h
str q2, [x0], #0x10
// Start the current chunk: low product with the zeta entry, rounding
// doubling high product with the twisted zeta entry.
mul v2.8h, v28.8h, v21.8h
sqrdmulh v27.8h, v28.8h, v7.8h
sub x4, x4, #0x1
cbnz x4, Lpoly_mulcache_compute_loop
// Tail: complete and store the last 8 cache entries.
mls v2.8h, v27.8h, v6.h[0]
str q2, [x0], #0x10
ret
.cfi_endproc

View File

@@ -0,0 +1,112 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*yaml
Name: poly_reduce_asm
Description: Barrett reduction of polynomial coefficients
Signature: void mlk_poly_reduce_asm(int16_t p[256])
ABI:
x0:
type: buffer
size_bytes: 512
permissions: read/write
c_parameter: int16_t p[256]
description: Input/output polynomial
Stack:
bytes: 0
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_reduce_asm.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_reduce_asm)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_reduce_asm)
/*
 * In-place Barrett reduction of the 256 16-bit coefficients at x0.
 *
 * Every vector a of 8 coefficients goes through the same sequence:
 *
 *     t    = sqdmulh(a, 20159)     high half of 2*a*20159
 *     t    = srshr(t, 11)          rounding shift; t ~ round(a*20159 / 2^26)
 *     a    = a - t * 3329          (mls)
 *     mask = sshr(a, 15)           all-ones in lanes that are negative
 *     a    = a + (3329 & mask)     add the modulus back to negative lanes
 *
 * 20159 equals round(2^26 / 3329).
 *
 * The polynomial is handled as 8 blocks of 64 bytes (4 vectors each). The
 * code is software-pipelined: the preamble fully reduces vectors 3 and 4 of
 * block 0 and loads vector 2; each of the 7 loop iterations completes and
 * stores the current block while reducing vectors 3 and 4 of the following
 * block; the tail completes block 7.
 *
 * Modifies x0-x2 and vector registers from v0-v7 and v16-v31 only, so no
 * callee-saved register needs to be spilled. Uses no stack.
 */
S2N_BN_SYMBOL(mlkem_poly_reduce_asm):
.cfi_startproc
// v3 = 3329 and v4 = 20159 in every 16-bit lane.
mov w2, #0xd01 // =3329
dup v3.8h, w2
mov w2, #0x4ebf // =20159
dup v4.8h, w2
// x1 = number of 64-byte blocks.
mov x1, #0x8 // =8
// Preamble: reduce vectors 3 and 4 of block 0 (results end up in v21 and
// v16) and load vector 2 of block 0 into v5.
ldr q21, [x0, #0x20]
ldr q23, [x0, #0x30]
sqdmulh v7.8h, v21.8h, v4.h[0]
sqdmulh v30.8h, v23.8h, v4.h[0]
srshr v7.8h, v7.8h, #0xb
srshr v30.8h, v30.8h, #0xb
mls v21.8h, v7.8h, v3.h[0]
mls v23.8h, v30.8h, v3.h[0]
ldr q5, [x0, #0x10]
sshr v7.8h, v21.8h, #0xf
sshr v30.8h, v23.8h, #0xf
and v7.16b, v3.16b, v7.16b
add v21.8h, v21.8h, v7.8h
and v7.16b, v3.16b, v30.16b
add v16.8h, v23.8h, v7.8h
// Part of block 0 is already done, so the loop runs 7 times.
sub x1, x1, #0x1
// Loop: x0 is advanced by 64 right away, so negative offsets address the
// block being completed and non-negative offsets address the next block.
Lpoly_reduce_loop:
ldr q6, [x0], #0x40
ldr q30, [x0, #0x20]
sqdmulh v31.8h, v6.8h, v4.h[0]
sqdmulh v29.8h, v5.8h, v4.h[0]
sqdmulh v22.8h, v30.8h, v4.h[0]
stur q16, [x0, #-0x10]
srshr v20.8h, v31.8h, #0xb
srshr v28.8h, v29.8h, #0xb
stur q21, [x0, #-0x20]
mls v6.8h, v20.8h, v3.h[0]
mls v5.8h, v28.8h, v3.h[0]
ldr q2, [x0, #0x30]
sshr v31.8h, v6.8h, #0xf
srshr v19.8h, v22.8h, #0xb
and v22.16b, v3.16b, v31.16b
add v0.8h, v6.8h, v22.8h
mls v30.8h, v19.8h, v3.h[0]
sshr v26.8h, v5.8h, #0xf
sqdmulh v25.8h, v2.8h, v4.h[0]
and v17.16b, v3.16b, v26.16b
add v1.8h, v5.8h, v17.8h
sshr v31.8h, v30.8h, #0xf
srshr v25.8h, v25.8h, #0xb
stur q1, [x0, #-0x30]
and v18.16b, v3.16b, v31.16b
mls v2.8h, v25.8h, v3.h[0]
add v21.8h, v30.8h, v18.8h
ldr q5, [x0, #0x10]
sshr v18.8h, v2.8h, #0xf
stur q0, [x0, #-0x40]
and v27.16b, v3.16b, v18.16b
add v16.8h, v2.8h, v27.8h
sub x1, x1, #0x1
cbnz x1, Lpoly_reduce_loop
// Tail: store the pending vectors 3 and 4 of the last block and reduce and
// store its vectors 1 and 2.
sqdmulh v20.8h, v5.8h, v4.h[0]
ldr q24, [x0], #0x40
stur q21, [x0, #-0x20]
srshr v20.8h, v20.8h, #0xb
sqdmulh v25.8h, v24.8h, v4.h[0]
stur q16, [x0, #-0x10]
mls v5.8h, v20.8h, v3.h[0]
srshr v20.8h, v25.8h, #0xb
sshr v2.8h, v5.8h, #0xf
mls v24.8h, v20.8h, v3.h[0]
and v20.16b, v3.16b, v2.16b
add v31.8h, v5.8h, v20.8h
sshr v20.8h, v24.8h, #0xf
stur q31, [x0, #-0x30]
and v31.16b, v3.16b, v20.16b
add v24.8h, v24.8h, v31.8h
stur q24, [x0, #-0x40]
ret
.cfi_endproc

View File

@@ -0,0 +1,131 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*yaml
Name: poly_tobytes_asm
Description: Convert polynomial to byte representation
Signature: void mlk_poly_tobytes_asm(uint8_t r[384], const int16_t a[256])
ABI:
x0:
type: buffer
size_bytes: 384
permissions: write-only
c_parameter: uint8_t r[384]
description: Output byte array
x1:
type: buffer
size_bytes: 512
permissions: read-only
c_parameter: const int16_t a[256]
description: Input polynomial
Stack:
bytes: 0
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_tobytes_asm.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_tobytes_asm)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_tobytes_asm)
/*
 * Pack the 256 16-bit coefficients at x1 into 384 bytes at x0, two
 * coefficients per three bytes.
 *
 * For a pair (e, o) of even- and odd-indexed coefficients (separated with
 * uzp1/uzp2) the three output bytes are built as:
 *
 *     byte0 = low 8 bits of e                      (xtn)
 *     byte1 = (e >> 8), with the low 4 bits of o
 *             inserted into bits 4..7              (shrn #8, then sli #4)
 *     byte2 = low 8 bits of (o >> 4)               (shrn #4)
 *
 * st3 then interleaves the three byte vectors, writing 24 bytes per
 * 16 coefficients.
 *
 * NOTE(review): the sli step keeps only bits 0..3 of (e >> 8), so the
 * encoding is lossless only if every coefficient fits in 12 bits;
 * presumably the caller guarantees this - confirm against the C wrapper.
 *
 * Work is done in 4 groups of 64 coefficients (96 output bytes). The
 * preamble loads group 0; each of the 3 loop iterations packs the loaded
 * group and loads the next one; the tail packs the last group.
 *
 * Modifies x0-x2 and vector registers from v1-v7 and v17-v30 only.
 * Uses no stack.
 */
S2N_BN_SYMBOL(mlkem_poly_tobytes_asm):
.cfi_startproc
mov x2, #0x10 // =16
// Preamble: load the first 64 coefficients (8 q-registers).
ldr q6, [x1], #0x20
ldur q24, [x1, #-0x10]
ldr q30, [x1], #0x20
ldur q22, [x1, #-0x10]
ldr q5, [x1], #0x20
ldur q17, [x1, #-0x10]
ldr q19, [x1], #0x20
ldur q4, [x1, #-0x10]
// 16 / 4 = 4 groups of 64 coefficients; one is handled by the tail, so the
// loop runs 3 times.
lsr x2, x2, #2
sub x2, x2, #0x1
Lpoly_tobytes_loop_start:
// Pack coefficients 0..15 of the group.
uzp1 v25.8h, v6.8h, v24.8h
uzp2 v6.8h, v6.8h, v24.8h
xtn v24.8b, v25.8h
shrn v25.8b, v25.8h, #0x8
xtn v18.8b, v6.8h
shrn v26.8b, v6.8h, #0x4
sli v25.8b, v18.8b, #0x4
st3 { v24.8b, v25.8b, v26.8b }, [x0], #24
// Pack coefficients 16..63 of the group; the three sub-blocks are
// interleaved by the instruction scheduler.
uzp1 v25.8h, v30.8h, v22.8h
uzp2 v6.8h, v30.8h, v22.8h
xtn v24.8b, v25.8h
xtn v18.8b, v6.8h
uzp1 v30.8h, v5.8h, v17.8h
uzp2 v22.8h, v5.8h, v17.8h
xtn v5.8b, v30.8h
xtn v17.8b, v22.8h
uzp1 v28.8h, v19.8h, v4.8h
uzp2 v19.8h, v19.8h, v4.8h
xtn v4.8b, v28.8h
xtn v20.8b, v19.8h
shrn v25.8b, v25.8h, #0x8
sli v25.8b, v18.8b, #0x4
shrn v26.8b, v6.8h, #0x4
st3 { v24.8b, v25.8b, v26.8b }, [x0], #24
shrn v6.8b, v30.8h, #0x8
sli v6.8b, v17.8b, #0x4
shrn v7.8b, v22.8h, #0x4
st3 { v5.8b, v6.8b, v7.8b }, [x0], #24
shrn v5.8b, v28.8h, #0x8
shrn v6.8b, v19.8h, #0x4
sli v5.8b, v20.8b, #0x4
st3 { v4.8b, v5.8b, v6.8b }, [x0], #24
// Load the next 64 coefficients for the following iteration (or the tail).
ldr q6, [x1], #0x20
ldur q24, [x1, #-0x10]
ldr q30, [x1], #0x20
ldur q22, [x1, #-0x10]
ldr q5, [x1], #0x20
ldur q17, [x1, #-0x10]
ldr q19, [x1], #0x20
ldur q4, [x1, #-0x10]
sub x2, x2, #0x1
cbnz x2, Lpoly_tobytes_loop_start
// Tail: pack the last 64 coefficients, which are already in registers.
// No further loads are issued.
uzp1 v25.8h, v30.8h, v22.8h
uzp2 v18.8h, v30.8h, v22.8h
uzp1 v30.8h, v6.8h, v24.8h
uzp2 v6.8h, v6.8h, v24.8h
uzp1 v24.8h, v5.8h, v17.8h
uzp2 v22.8h, v5.8h, v17.8h
uzp1 v5.8h, v19.8h, v4.8h
uzp2 v17.8h, v19.8h, v4.8h
xtn v19.8b, v25.8h
shrn v20.8b, v25.8h, #0x8
xtn v25.8b, v18.8h
shrn v21.8b, v18.8h, #0x4
xtn v28.8b, v30.8h
shrn v29.8b, v30.8h, #0x8
xtn v18.8b, v6.8h
shrn v30.8b, v6.8h, #0x4
xtn v1.8b, v24.8h
shrn v2.8b, v24.8h, #0x8
xtn v6.8b, v22.8h
shrn v3.8b, v22.8h, #0x4
xtn v22.8b, v5.8h
shrn v23.8b, v5.8h, #0x8
xtn v5.8b, v17.8h
shrn v24.8b, v17.8h, #0x4
sli v20.8b, v25.8b, #0x4
sli v29.8b, v18.8b, #0x4
// The stores are issued in coefficient order: 0..15, 16..31, 32..47, 48..63.
st3 { v28.8b, v29.8b, v30.8b }, [x0], #24
st3 { v19.8b, v20.8b, v21.8b }, [x0], #24
sli v2.8b, v6.8b, #0x4
st3 { v1.8b, v2.8b, v3.8b }, [x0], #24
sli v23.8b, v5.8b, #0x4
st3 { v22.8b, v23.8b, v24.8b }, [x0], #24
ret
.cfi_endproc

View File

@@ -0,0 +1,92 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*yaml
Name: poly_tomont_asm
Description: Convert polynomial to Montgomery domain
Signature: void mlk_poly_tomont_asm(int16_t p[256])
ABI:
x0:
type: buffer
size_bytes: 512
permissions: read/write
c_parameter: int16_t p[256]
description: Input/output polynomial
Stack:
bytes: 0
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/poly_tomont_asm.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_tomont_asm)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_tomont_asm)
/*
 * In-place multiplication of the 256 16-bit coefficients at x0 by the
 * constant -1044 modulo 3329. -1044 is congruent to 2^16 modulo 3329, which
 * is the factor that maps a value into the Montgomery domain.
 *
 * Every vector a of 8 coefficients goes through the same sequence:
 *
 *     lo  = a * (-1044)              (mul, low 16 bits)
 *     hi  = sqrdmulh(a, -10276)      (rounded high half of 2*a*(-10276))
 *     a   = lo - hi * 3329           (mls)
 *
 * -10276 equals round(-1044 * 2^15 / 3329).
 *
 * The polynomial is handled as 8 blocks of 64 bytes (4 vectors each). The
 * code is software-pipelined: the preamble loads vectors 2-4 of block 0 and
 * starts the products for vector 4; each of the 7 loop iterations completes
 * and stores the current block and starts the next one; the tail completes
 * block 7.
 *
 * Modifies x0-x2 and vector registers from v0-v7 and v17-v29 only.
 * Uses no stack.
 */
S2N_BN_SYMBOL(mlkem_poly_tomont_asm):
.cfi_startproc
// v4 = 3329 in every 16-bit lane.
mov w2, #0xd01 // =3329
dup v4.8h, w2
// v5 = 20159 in every lane. This value is never read: v5 is overwritten in
// the loop before its first use.
mov w2, #0x4ebf // =20159
dup v5.8h, w2
// v2 = -1044 (the multiplier) and v3 = -10276 (its companion for sqrdmulh).
mov w2, #-0x414 // =-1044
dup v2.8h, w2
mov w2, #-0x2824 // =-10276
dup v3.8h, w2
// x1 = number of 64-byte blocks.
mov x1, #0x8 // =8
// Preamble: load vectors 4, 2 and 3 of block 0 and start the lo/hi products
// for vector 4.
ldr q26, [x0, #0x30]
ldr q23, [x0, #0x10]
mul v17.8h, v26.8h, v2.8h
sqrdmulh v7.8h, v26.8h, v3.8h
ldr q27, [x0, #0x20]
// One block is already in flight, so the loop runs 7 times.
sub x1, x1, #0x1
// Loop: x0 is advanced by 64 right after loading vector 1, so negative
// offsets address the block being completed and non-negative offsets address
// the next block.
Lpoly_tomont_loop:
mls v17.8h, v7.8h, v4.h[0]
sqrdmulh v5.8h, v23.8h, v3.8h
ldr q7, [x0], #0x40
stur q17, [x0, #-0x10]
sqrdmulh v29.8h, v27.8h, v3.8h
sqrdmulh v19.8h, v7.8h, v3.8h
mul v25.8h, v23.8h, v2.8h
mul v0.8h, v7.8h, v2.8h
mul v26.8h, v27.8h, v2.8h
ldr q7, [x0, #0x30]
mls v25.8h, v5.8h, v4.h[0]
ldr q23, [x0, #0x10]
mls v26.8h, v29.8h, v4.h[0]
mls v0.8h, v19.8h, v4.h[0]
stur q25, [x0, #-0x30]
mul v17.8h, v7.8h, v2.8h
sqrdmulh v7.8h, v7.8h, v3.8h
stur q0, [x0, #-0x40]
ldr q27, [x0, #0x20]
stur q26, [x0, #-0x20]
sub x1, x1, #0x1
cbnz x1, Lpoly_tomont_loop
// Tail: complete and store the four vectors of the last block.
mls v17.8h, v7.8h, v4.h[0]
sqrdmulh v7.8h, v23.8h, v3.8h
mul v26.8h, v23.8h, v2.8h
sqrdmulh v25.8h, v27.8h, v3.8h
ldr q23, [x0], #0x40
mul v27.8h, v27.8h, v2.8h
mls v26.8h, v7.8h, v4.h[0]
sqrdmulh v7.8h, v23.8h, v3.8h
mul v23.8h, v23.8h, v2.8h
stur q17, [x0, #-0x10]
mls v27.8h, v25.8h, v4.h[0]
stur q26, [x0, #-0x30]
mls v23.8h, v7.8h, v4.h[0]
stur q27, [x0, #-0x20]
stur q23, [x0, #-0x40]
ret
.cfi_endproc

View File

@@ -0,0 +1,251 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [NeonNTT]
* Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
* Becker, Hwang, Kannwischer, Yang, Yang
* https://eprint.iacr.org/2021/986
*/
/*yaml
Name: polyvec_basemul_acc_montgomery_cached_asm_k2
Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=2
Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t r[256], const int16_t a[512], const int16_t b[512], const int16_t b_cache[256])
ABI:
x0:
type: buffer
size_bytes: 512
permissions: write-only
c_parameter: int16_t r[256]
description: Output polynomial
x1:
type: buffer
size_bytes: 1024
permissions: read-only
c_parameter: const int16_t a[512]
description: Input polynomial vector a
x2:
type: buffer
size_bytes: 1024
permissions: read-only
c_parameter: const int16_t b[512]
description: Input polynomial vector b
x3:
type: buffer
size_bytes: 512
permissions: read-only
c_parameter: const int16_t b_cache[256]
description: Cached values for b
Stack:
bytes: 64
description: saving callee-saved Neon registers
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k2)
/*
 * Accumulated base multiplication of two length-2 polynomial vectors.
 *
 *   x0: output polynomial r                 (512 bytes, written)
 *   x1: vector a, 2 polynomials             (x4 = x1 + 0x200 is the second)
 *   x2: vector b, 2 polynomials             (x5 = x2 + 0x200 is the second)
 *   x3: cache for b, 2 x 128 entries        (x6 = x3 + 0x100 is the second)
 *
 * Coefficients are handled in pairs (even index, odd index), split with
 * uzp1/uzp2. With ae/ao and be/bo the even/odd coefficients of a and b and
 * bc the matching cache entry, the 32-bit accumulators are, summed over both
 * polynomials j:
 *
 *     even = sum_j ( ae_j * be_j + ao_j * bc_j )
 *     odd  = sum_j ( ae_j * bo_j + ao_j * be_j )
 *
 * Each accumulator acc is then Montgomery-reduced to 16 bits:
 *
 *     t   = low16(acc) * 3327        (mul; 3329 * 3327 = -1 mod 2^16)
 *     acc = acc + t * 3329           (smlal; low 16 bits become zero)
 *     res = high16(acc)              (uzp2)
 *
 * and the even/odd results are re-interleaved with zip1/zip2 before being
 * stored.
 *
 * The work is split into 16 blocks of 16 coefficients (32 output bytes).
 * The code is software-pipelined: the preamble finishes block 0 and starts
 * block 1, each of the 14 loop iterations stores one finished block and
 * advances the pipeline by one block, and the tail finishes and stores the
 * last two blocks.
 *
 * v8-v15 are used, so d8-d15 are saved in a 64-byte stack frame.
 */
S2N_BN_SYMBOL(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k2):
.cfi_startproc
// Prologue: allocate 64 bytes and save d8-d15.
sub sp, sp, #0x40
.cfi_adjust_cfa_offset 0x40
stp d8, d9, [sp]
.cfi_rel_offset d8, 0x0
.cfi_rel_offset d9, 0x8
stp d10, d11, [sp, #0x10]
.cfi_rel_offset d10, 0x10
.cfi_rel_offset d11, 0x18
stp d12, d13, [sp, #0x20]
.cfi_rel_offset d12, 0x20
.cfi_rel_offset d13, 0x28
stp d14, d15, [sp, #0x30]
.cfi_rel_offset d14, 0x30
.cfi_rel_offset d15, 0x38
// v0 = 3329 and v2 = 3327 in every 16-bit lane (Montgomery constants).
mov w14, #0xd01 // =3329
dup v0.8h, w14
mov w14, #0xcff // =3327
dup v2.8h, w14
// Pointers to the second polynomial of a, b and the cache.
add x4, x1, #0x200
add x5, x2, #0x200
add x6, x3, #0x100
// x13 = number of 16-coefficient blocks.
mov x13, #0x10 // =16
// Preamble, block 0: load and de-interleave a and b for both polynomials,
// accumulate the odd result in v25 (low half) / v5 (high half) and the even
// result in v19 / v26.
ldr q9, [x4], #0x20
ldur q5, [x4, #-0x10]
ldr q11, [x5], #0x20
uzp1 v23.8h, v9.8h, v5.8h
uzp2 v9.8h, v9.8h, v5.8h
ldr q5, [x2], #0x20
ldur q7, [x5, #-0x10]
ldur q21, [x2, #-0x10]
uzp2 v10.8h, v11.8h, v7.8h
uzp1 v11.8h, v11.8h, v7.8h
uzp1 v7.8h, v5.8h, v21.8h
uzp2 v5.8h, v5.8h, v21.8h
ldr q21, [x1], #0x20
ldur q25, [x1, #-0x10]
ld1 { v6.8h }, [x3], #16
uzp1 v26.8h, v21.8h, v25.8h
uzp2 v21.8h, v21.8h, v25.8h
smull v25.4s, v26.4h, v5.4h
smull2 v5.4s, v26.8h, v5.8h
smull v19.4s, v26.4h, v7.4h
smull2 v26.4s, v26.8h, v7.8h
smlal v25.4s, v21.4h, v7.4h
smlal2 v5.4s, v21.8h, v7.8h
smlal v19.4s, v21.4h, v6.4h
smlal2 v26.4s, v21.8h, v6.8h
smlal v25.4s, v23.4h, v10.4h
smlal2 v5.4s, v23.8h, v10.8h
smlal v19.4s, v23.4h, v11.4h
smlal2 v26.4s, v23.8h, v11.8h
ld1 { v23.8h }, [x6], #16
smlal v25.4s, v9.4h, v11.4h
smlal2 v5.4s, v9.8h, v11.8h
smlal2 v26.4s, v9.8h, v23.8h
smlal v19.4s, v9.4h, v23.4h
// Montgomery reduction of block 0, interleaved with the loads and the first
// products of block 1.
ldr q9, [x4], #0x20
uzp1 v11.8h, v25.8h, v5.8h
uzp1 v23.8h, v19.8h, v26.8h
mul v11.8h, v11.8h, v2.8h
mul v23.8h, v23.8h, v2.8h
ldr q7, [x5], #0x20
smlal2 v5.4s, v11.8h, v0.8h
smlal v25.4s, v11.4h, v0.4h
ldr q11, [x2], #0x20
ldur q21, [x2, #-0x10]
ldur q6, [x4, #-0x10]
uzp1 v17.8h, v11.8h, v21.8h
ldr q10, [x1], #0x20
ldur q29, [x1, #-0x10]
uzp2 v11.8h, v11.8h, v21.8h
uzp1 v13.8h, v9.8h, v6.8h
uzp1 v3.8h, v10.8h, v29.8h
uzp2 v10.8h, v10.8h, v29.8h
smull v12.4s, v3.4h, v11.4h
smull2 v11.4s, v3.8h, v11.8h
ldur q21, [x5, #-0x10]
smlal v12.4s, v10.4h, v17.4h
smlal2 v11.4s, v10.8h, v17.8h
uzp2 v29.8h, v7.8h, v21.8h
uzp1 v15.8h, v7.8h, v21.8h
smlal v12.4s, v13.4h, v29.4h
smlal2 v11.4s, v13.8h, v29.8h
uzp2 v28.8h, v9.8h, v6.8h
smlal2 v26.4s, v23.8h, v0.8h
smlal v12.4s, v28.4h, v15.4h
smlal2 v11.4s, v28.8h, v15.8h
smlal v19.4s, v23.4h, v0.4h
uzp2 v27.8h, v25.8h, v5.8h
smull v23.4s, v3.4h, v17.4h
uzp1 v9.8h, v12.8h, v11.8h
uzp2 v19.8h, v19.8h, v26.8h
mul v14.8h, v9.8h, v2.8h
ld1 { v22.8h }, [x6], #16
zip2 v9.8h, v19.8h, v27.8h
smlal2 v11.4s, v14.8h, v0.8h
ld1 { v4.8h }, [x3], #16
// Two blocks are handled outside the loop, so it runs 14 times.
sub x13, x13, #0x2
// Loop: store the finished block (upper half from v9, lower half via zip1),
// complete the block in flight and start the next one.
Lpolyvec_basemul_acc_montgomery_cached_k2_loop:
smull2 v20.4s, v3.8h, v17.8h
ldr q18, [x4], #0x20
ldr q30, [x5], #0x20
smlal2 v20.4s, v10.8h, v4.8h
smlal v12.4s, v14.4h, v0.4h
smlal v23.4s, v10.4h, v4.4h
str q9, [x0, #0x10]
smlal2 v20.4s, v13.8h, v15.8h
ldr q8, [x2], #0x20
smlal v23.4s, v13.4h, v15.4h
smlal2 v20.4s, v28.8h, v22.8h
zip1 v26.8h, v19.8h, v27.8h
ldur q9, [x2, #-0x10]
smlal v23.4s, v28.4h, v22.4h
uzp2 v27.8h, v12.8h, v11.8h
uzp1 v17.8h, v8.8h, v9.8h
uzp2 v4.8h, v8.8h, v9.8h
uzp1 v5.8h, v23.8h, v20.8h
str q26, [x0], #0x20
mul v31.8h, v5.8h, v2.8h
ldur q19, [x4, #-0x10]
ldr q29, [x1], #0x20
ldur q12, [x1, #-0x10]
smlal2 v20.4s, v31.8h, v0.8h
uzp1 v13.8h, v18.8h, v19.8h
uzp1 v3.8h, v29.8h, v12.8h
uzp2 v10.8h, v29.8h, v12.8h
smull v12.4s, v3.4h, v4.4h
smull2 v11.4s, v3.8h, v4.8h
ldur q5, [x5, #-0x10]
smlal v12.4s, v10.4h, v17.4h
smlal2 v11.4s, v10.8h, v17.8h
uzp2 v14.8h, v30.8h, v5.8h
uzp1 v15.8h, v30.8h, v5.8h
smlal v12.4s, v13.4h, v14.4h
smlal2 v11.4s, v13.8h, v14.8h
uzp2 v28.8h, v18.8h, v19.8h
smlal v23.4s, v31.4h, v0.4h
smlal v12.4s, v28.4h, v15.4h
smlal2 v11.4s, v28.8h, v15.8h
ld1 { v22.8h }, [x6], #16
uzp2 v19.8h, v23.8h, v20.8h
uzp1 v1.8h, v12.8h, v11.8h
smull v23.4s, v3.4h, v17.4h
mul v14.8h, v1.8h, v2.8h
zip2 v9.8h, v19.8h, v27.8h
ld1 { v4.8h }, [x3], #16
smlal2 v11.4s, v14.8h, v0.8h
sub x13, x13, #0x1
cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k2_loop
// Tail: store the pending block, then finish, reduce and store the last
// block. No further loads are issued.
smull2 v5.4s, v3.8h, v17.8h
smlal v12.4s, v14.4h, v0.4h
smlal v23.4s, v10.4h, v4.4h
str q9, [x0, #0x10]
smlal2 v5.4s, v10.8h, v4.8h
uzp2 v11.8h, v12.8h, v11.8h
zip1 v9.8h, v19.8h, v27.8h
smlal v23.4s, v13.4h, v15.4h
smlal2 v5.4s, v13.8h, v15.8h
str q9, [x0], #0x20
smlal v23.4s, v28.4h, v22.4h
smlal2 v5.4s, v28.8h, v22.8h
uzp1 v9.8h, v23.8h, v5.8h
mul v9.8h, v9.8h, v2.8h
smlal2 v5.4s, v9.8h, v0.8h
smlal v23.4s, v9.4h, v0.4h
uzp2 v9.8h, v23.8h, v5.8h
zip2 v5.8h, v9.8h, v11.8h
zip1 v9.8h, v9.8h, v11.8h
str q5, [x0, #0x10]
str q9, [x0], #0x20
// Epilogue: restore d8-d15 and release the stack frame.
ldp d8, d9, [sp]
.cfi_restore d8
.cfi_restore d9
ldp d10, d11, [sp, #0x10]
.cfi_restore d10
.cfi_restore d11
ldp d12, d13, [sp, #0x20]
.cfi_restore d12
.cfi_restore d13
ldp d14, d15, [sp, #0x30]
.cfi_restore d14
.cfi_restore d15
add sp, sp, #0x40
.cfi_adjust_cfa_offset -0x40
ret
.cfi_endproc

View File

@@ -0,0 +1,305 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [NeonNTT]
* Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
* Becker, Hwang, Kannwischer, Yang, Yang
* https://eprint.iacr.org/2021/986
*/
/*yaml
Name: polyvec_basemul_acc_montgomery_cached_asm_k3
Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=3
Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t r[256], const int16_t a[768], const int16_t b[768], const int16_t b_cache[384])
ABI:
x0:
type: buffer
size_bytes: 512
permissions: write-only
c_parameter: int16_t r[256]
description: Output polynomial
x1:
type: buffer
size_bytes: 1536
permissions: read-only
c_parameter: const int16_t a[768]
description: Input polynomial vector a
x2:
type: buffer
size_bytes: 1536
permissions: read-only
c_parameter: const int16_t b[768]
description: Input polynomial vector b
x3:
type: buffer
size_bytes: 768
permissions: read-only
c_parameter: const int16_t b_cache[384]
description: Cached values for b
Stack:
bytes: 64
description: saving callee-saved Neon registers
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k3)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k3)
/*
 * Accumulated base multiplication of two length-3 polynomial vectors.
 *
 *   x0: output polynomial r                 (512 bytes, written)
 *   x1: vector a, 3 polynomials             (x4 = x1 + 0x200, x7 = x1 + 0x400)
 *   x2: vector b, 3 polynomials             (x5 = x2 + 0x200, x8 = x2 + 0x400)
 *   x3: cache for b, 3 x 128 entries        (x6 = x3 + 0x100, x9 = x3 + 0x200)
 *
 * Coefficients are handled in pairs (even index, odd index), split with
 * uzp1/uzp2. With ae/ao and be/bo the even/odd coefficients of a and b and
 * bc the matching cache entry, the 32-bit accumulators are, summed over the
 * three polynomials j:
 *
 *     even = sum_j ( ae_j * be_j + ao_j * bc_j )
 *     odd  = sum_j ( ae_j * bo_j + ao_j * be_j )
 *
 * Each accumulator acc is then Montgomery-reduced to 16 bits:
 *
 *     t   = low16(acc) * 3327        (mul; 3329 * 3327 = -1 mod 2^16)
 *     acc = acc + t * 3329           (smlal; low 16 bits become zero)
 *     res = high16(acc)              (uzp2)
 *
 * and the even/odd results are re-interleaved with zip1/zip2 before being
 * stored.
 *
 * The work is split into 16 blocks of 16 coefficients (32 output bytes).
 * The code is software-pipelined: the preamble computes block 0 and loads
 * the inputs of block 1; each of the 14 loop iterations computes the loaded
 * block, stores the previously finished one and loads the next; the tail
 * computes the last block and stores the final two blocks.
 *
 * v8-v15 are used, so d8-d15 are saved in a 64-byte stack frame.
 */
S2N_BN_SYMBOL(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k3):
.cfi_startproc
// Prologue: allocate 64 bytes and save d8-d15.
sub sp, sp, #0x40
.cfi_adjust_cfa_offset 0x40
stp d8, d9, [sp]
.cfi_rel_offset d8, 0x0
.cfi_rel_offset d9, 0x8
stp d10, d11, [sp, #0x10]
.cfi_rel_offset d10, 0x10
.cfi_rel_offset d11, 0x18
stp d12, d13, [sp, #0x20]
.cfi_rel_offset d12, 0x20
.cfi_rel_offset d13, 0x28
stp d14, d15, [sp, #0x30]
.cfi_rel_offset d14, 0x30
.cfi_rel_offset d15, 0x38
// v0 = 3329 and v2 = 3327 in every 16-bit lane (Montgomery constants).
mov w14, #0xd01 // =3329
dup v0.8h, w14
mov w14, #0xcff // =3327
dup v2.8h, w14
// Pointers to the second and third polynomial of a, b and the cache.
add x4, x1, #0x200
add x5, x2, #0x200
add x6, x3, #0x100
add x7, x1, #0x400
add x8, x2, #0x400
add x9, x3, #0x200
// x13 = number of 16-coefficient blocks.
mov x13, #0x10 // =16
// Preamble, block 0, polynomial 0: the odd result accumulates in v30 (low
// half) / v7 (high half), the even result in v9 / v16.
ldr q7, [x2, #0x10]
ldr q20, [x2], #0x20
ldr q15, [x1, #0x10]
uzp1 v8.8h, v20.8h, v7.8h
uzp2 v7.8h, v20.8h, v7.8h
ld1 { v20.8h }, [x3], #16
ldr q30, [x1], #0x20
ldr q11, [x4], #0x20
uzp1 v16.8h, v30.8h, v15.8h
uzp2 v15.8h, v30.8h, v15.8h
smull v30.4s, v16.4h, v7.4h
smull2 v7.4s, v16.8h, v7.8h
smull v9.4s, v16.4h, v8.4h
smull2 v16.4s, v16.8h, v8.8h
smlal v30.4s, v15.4h, v8.4h
smlal2 v7.4s, v15.8h, v8.8h
smlal v9.4s, v15.4h, v20.4h
smlal2 v16.4s, v15.8h, v20.8h
// Block 0, polynomial 1.
ldur q20, [x4, #-0x10]
ldr q15, [x5], #0x20
uzp1 v8.8h, v11.8h, v20.8h
uzp2 v20.8h, v11.8h, v20.8h
ldur q11, [x5, #-0x10]
ld1 { v27.8h }, [x6], #16
uzp1 v10.8h, v15.8h, v11.8h
uzp2 v15.8h, v15.8h, v11.8h
smlal v9.4s, v8.4h, v10.4h
smlal2 v16.4s, v8.8h, v10.8h
smlal v30.4s, v8.4h, v15.4h
smlal2 v7.4s, v8.8h, v15.8h
smlal v9.4s, v20.4h, v27.4h
smlal2 v16.4s, v20.8h, v27.8h
smlal v30.4s, v20.4h, v10.4h
smlal2 v7.4s, v20.8h, v10.8h
// Block 0, polynomial 2.
ldr q20, [x7], #0x20
ldur q15, [x7, #-0x10]
ldr q8, [x8], #0x20
uzp1 v11.8h, v20.8h, v15.8h
uzp2 v20.8h, v20.8h, v15.8h
ldur q15, [x8, #-0x10]
ld1 { v27.8h }, [x9], #16
uzp1 v10.8h, v8.8h, v15.8h
uzp2 v15.8h, v8.8h, v15.8h
smlal v9.4s, v11.4h, v10.4h
smlal2 v16.4s, v11.8h, v10.8h
smlal v30.4s, v11.4h, v15.4h
smlal2 v7.4s, v11.8h, v15.8h
smlal v9.4s, v20.4h, v27.4h
smlal2 v16.4s, v20.8h, v27.8h
smlal v30.4s, v20.4h, v10.4h
smlal2 v7.4s, v20.8h, v10.8h
// Montgomery reduction of block 0 (even result ends up in v27, odd result
// in v10), interleaved with the loads for block 1.
ldr q15, [x2], #0x20
uzp1 v20.8h, v9.8h, v16.8h
uzp1 v8.8h, v30.8h, v7.8h
mul v20.8h, v20.8h, v2.8h
mul v8.8h, v8.8h, v2.8h
ldr q21, [x4], #0x20
smlal v9.4s, v20.4h, v0.4h
smlal2 v16.4s, v20.8h, v0.8h
smlal v30.4s, v8.4h, v0.4h
smlal2 v7.4s, v8.8h, v0.8h
ldur q6, [x4, #-0x10]
uzp2 v27.8h, v9.8h, v16.8h
uzp2 v10.8h, v30.8h, v7.8h
ldur q16, [x2, #-0x10]
ldr q30, [x1, #0x10]
ld1 { v9.8h }, [x3], #16
ldr q1, [x5], #0x20
ldur q12, [x5, #-0x10]
ld1 { v24.8h }, [x6], #16
ldr q19, [x7], #0x20
ldur q31, [x7, #-0x10]
ldr q17, [x8], #0x20
ldur q18, [x8, #-0x10]
ld1 { v25.8h }, [x9], #16
// Two blocks are handled outside the loop, so it runs 14 times.
sub x13, x13, #0x2
// Loop: compute the block whose inputs are already loaded, interleave and
// store the previously reduced block (v27/v10), then load the next block.
Lpolyvec_basemul_acc_montgomery_cached_k3_loop:
ldr q20, [x1], #0x20
uzp1 v7.8h, v15.8h, v16.8h
uzp2 v15.8h, v15.8h, v16.8h
uzp1 v8.8h, v20.8h, v30.8h
uzp2 v20.8h, v20.8h, v30.8h
smull v30.4s, v8.4h, v15.4h
smull2 v15.4s, v8.8h, v15.8h
smull v11.4s, v8.4h, v7.4h
smull2 v8.4s, v8.8h, v7.8h
smlal v30.4s, v20.4h, v7.4h
smlal2 v15.4s, v20.8h, v7.8h
smlal v11.4s, v20.4h, v9.4h
smlal2 v8.4s, v20.8h, v9.8h
uzp1 v7.8h, v21.8h, v6.8h
uzp2 v20.8h, v21.8h, v6.8h
uzp1 v16.8h, v1.8h, v12.8h
uzp2 v9.8h, v1.8h, v12.8h
smlal v11.4s, v7.4h, v16.4h
smlal2 v8.4s, v7.8h, v16.8h
smlal v30.4s, v7.4h, v9.4h
smlal2 v15.4s, v7.8h, v9.8h
smlal v11.4s, v20.4h, v24.4h
smlal2 v8.4s, v20.8h, v24.8h
smlal v30.4s, v20.4h, v16.4h
smlal2 v15.4s, v20.8h, v16.8h
uzp1 v7.8h, v19.8h, v31.8h
uzp2 v20.8h, v19.8h, v31.8h
uzp1 v16.8h, v17.8h, v18.8h
uzp2 v9.8h, v17.8h, v18.8h
smlal v11.4s, v7.4h, v16.4h
smlal2 v8.4s, v7.8h, v16.8h
smlal v30.4s, v7.4h, v9.4h
smlal2 v15.4s, v7.8h, v9.8h
smlal v11.4s, v20.4h, v25.4h
smlal2 v8.4s, v20.8h, v25.8h
smlal v30.4s, v20.4h, v16.4h
smlal2 v15.4s, v20.8h, v16.8h
ldr q16, [x2, #0x10]
uzp1 v7.8h, v11.8h, v8.8h
uzp1 v20.8h, v30.8h, v15.8h
mul v7.8h, v7.8h, v2.8h
mul v20.8h, v20.8h, v2.8h
zip2 v9.8h, v27.8h, v10.8h
zip1 v27.8h, v27.8h, v10.8h
smlal v11.4s, v7.4h, v0.4h
smlal2 v8.4s, v7.8h, v0.8h
smlal v30.4s, v20.4h, v0.4h
smlal2 v15.4s, v20.8h, v0.8h
str q27, [x0], #0x20
uzp2 v27.8h, v11.8h, v8.8h
stur q9, [x0, #-0x10]
uzp2 v10.8h, v30.8h, v15.8h
ldr q30, [x1, #0x10]
ldr q15, [x2], #0x20
ld1 { v9.8h }, [x3], #16
ldr q21, [x4], #0x20
ldur q6, [x4, #-0x10]
ldr q1, [x5], #0x20
ldur q12, [x5, #-0x10]
ld1 { v24.8h }, [x6], #16
ldr q19, [x7], #0x20
ldur q31, [x7, #-0x10]
ldr q17, [x8], #0x20
ldur q18, [x8, #-0x10]
ld1 { v25.8h }, [x9], #16
sub x13, x13, #0x1
cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k3_loop
// Tail: compute and reduce the last block, store the pending block and then
// the last block. Only the remaining half of the a[0] block is still loaded.
ldr q7, [x1], #0x20
uzp1 v20.8h, v15.8h, v16.8h
uzp2 v15.8h, v15.8h, v16.8h
uzp1 v23.8h, v7.8h, v30.8h
uzp2 v11.8h, v7.8h, v30.8h
smull2 v8.4s, v23.8h, v20.8h
smull v5.4s, v23.4h, v20.4h
smull2 v30.4s, v23.8h, v15.8h
uzp1 v28.8h, v1.8h, v12.8h
smlal2 v8.4s, v11.8h, v9.8h
smlal v5.4s, v11.4h, v9.4h
uzp1 v3.8h, v21.8h, v6.8h
smull v16.4s, v23.4h, v15.4h
smlal2 v8.4s, v3.8h, v28.8h
smlal v5.4s, v3.4h, v28.4h
uzp2 v29.8h, v21.8h, v6.8h
uzp1 v7.8h, v17.8h, v18.8h
smlal2 v8.4s, v29.8h, v24.8h
uzp1 v14.8h, v19.8h, v31.8h
smlal v16.4s, v11.4h, v20.4h
smlal2 v30.4s, v11.8h, v20.8h
smlal2 v8.4s, v14.8h, v7.8h
uzp2 v20.8h, v1.8h, v12.8h
uzp2 v21.8h, v19.8h, v31.8h
smlal2 v30.4s, v3.8h, v20.8h
smlal v16.4s, v3.4h, v20.4h
smlal v5.4s, v29.4h, v24.4h
uzp2 v9.8h, v17.8h, v18.8h
smlal2 v30.4s, v29.8h, v28.8h
smlal v16.4s, v29.4h, v28.4h
smlal v5.4s, v14.4h, v7.4h
smlal2 v8.4s, v21.8h, v25.8h
smlal2 v30.4s, v14.8h, v9.8h
smlal v16.4s, v14.4h, v9.4h
smlal v5.4s, v21.4h, v25.4h
zip1 v20.8h, v27.8h, v10.8h
smlal2 v30.4s, v21.8h, v7.8h
smlal v16.4s, v21.4h, v7.4h
uzp1 v7.8h, v5.8h, v8.8h
str q20, [x0], #0x20
mul v15.8h, v7.8h, v2.8h
uzp1 v7.8h, v16.8h, v30.8h
zip2 v31.8h, v27.8h, v10.8h
mul v20.8h, v7.8h, v2.8h
smlal v5.4s, v15.4h, v0.4h
smlal2 v8.4s, v15.8h, v0.8h
stur q31, [x0, #-0x10]
smlal2 v30.4s, v20.8h, v0.8h
smlal v16.4s, v20.4h, v0.4h
uzp2 v15.8h, v5.8h, v8.8h
uzp2 v20.8h, v16.8h, v30.8h
zip1 v7.8h, v15.8h, v20.8h
zip2 v20.8h, v15.8h, v20.8h
str q7, [x0], #0x20
stur q20, [x0, #-0x10]
// Epilogue: restore d8-d15 and release the stack frame.
ldp d8, d9, [sp]
.cfi_restore d8
.cfi_restore d9
ldp d10, d11, [sp, #0x10]
.cfi_restore d10
.cfi_restore d11
ldp d12, d13, [sp, #0x20]
.cfi_restore d12
.cfi_restore d13
ldp d14, d15, [sp, #0x30]
.cfi_restore d14
.cfi_restore d15
add sp, sp, #0x40
.cfi_adjust_cfa_offset -0x40
ret
.cfi_endproc

View File

@@ -0,0 +1,359 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [NeonNTT]
* Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
* Becker, Hwang, Kannwischer, Yang, Yang
* https://eprint.iacr.org/2021/986
*/
/*yaml
Name: polyvec_basemul_acc_montgomery_cached_asm_k4
Description: Re-implementation of asymmetric base multiplication following @[NeonNTT] for k=4
Signature: void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t r[256], const int16_t a[1024], const int16_t b[1024], const int16_t b_cache[512])
ABI:
x0:
type: buffer
size_bytes: 512
permissions: write-only
c_parameter: int16_t r[256]
description: Output polynomial
x1:
type: buffer
size_bytes: 2048
permissions: read-only
c_parameter: const int16_t a[1024]
description: Input polynomial vector a
x2:
type: buffer
size_bytes: 2048
permissions: read-only
c_parameter: const int16_t b[1024]
description: Input polynomial vector b
x3:
type: buffer
size_bytes: 1024
permissions: read-only
c_parameter: const int16_t b_cache[512]
description: Cached values for b
Stack:
bytes: 64
description: saving callee-saved Neon registers
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k4)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k4)
/*
 * Accumulated base multiplication of two length-4 polynomial vectors.
 *
 *   x0: output polynomial r              (512 bytes, written)
 *   x1: vector a, 4 polynomials          (x4, x7, x10 = x1 + 0x200/0x400/0x600)
 *   x2: vector b, 4 polynomials          (x5, x8, x11 = x2 + 0x200/0x400/0x600)
 *   x3: cache for b, 4 x 128 entries     (x6, x9, x12 = x3 + 0x100/0x200/0x300)
 *
 * Coefficients are handled in pairs (even index, odd index), split with
 * uzp1/uzp2. With ae/ao and be/bo the even/odd coefficients of a and b and
 * bc the matching cache entry, the 32-bit accumulators are, summed over the
 * four polynomials j:
 *
 *     even = sum_j ( ae_j * be_j + ao_j * bc_j )
 *     odd  = sum_j ( ae_j * bo_j + ao_j * be_j )
 *
 * Each accumulator acc is then Montgomery-reduced to 16 bits:
 *
 *     t   = low16(acc) * 3327        (mul; 3329 * 3327 = -1 mod 2^16)
 *     acc = acc + t * 3329           (smlal; low 16 bits become zero)
 *     res = high16(acc)              (uzp2)
 *
 * and the even/odd results are re-interleaved with zip1/zip2 before being
 * stored.
 *
 * The work is split into 16 blocks of 16 coefficients (32 output bytes).
 * The code is software-pipelined: the preamble finishes block 0 and starts
 * block 1, each of the 14 loop iterations stores one finished block and
 * advances the pipeline by one block, and the tail finishes and stores the
 * last two blocks.
 *
 * v8-v15 are used, so d8-d15 are saved in a 64-byte stack frame.
 */
S2N_BN_SYMBOL(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k4):
.cfi_startproc
// Prologue: allocate 64 bytes and save d8-d15.
sub sp, sp, #0x40
.cfi_adjust_cfa_offset 0x40
stp d8, d9, [sp]
.cfi_rel_offset d8, 0x0
.cfi_rel_offset d9, 0x8
stp d10, d11, [sp, #0x10]
.cfi_rel_offset d10, 0x10
.cfi_rel_offset d11, 0x18
stp d12, d13, [sp, #0x20]
.cfi_rel_offset d12, 0x20
.cfi_rel_offset d13, 0x28
stp d14, d15, [sp, #0x30]
.cfi_rel_offset d14, 0x30
.cfi_rel_offset d15, 0x38
// v0 = 3329 and v2 = 3327 in every 16-bit lane (Montgomery constants).
mov w14, #0xd01 // =3329
dup v0.8h, w14
mov w14, #0xcff // =3327
dup v2.8h, w14
// Pointers to polynomials 1, 2 and 3 of a, b and the cache.
add x4, x1, #0x200
add x5, x2, #0x200
add x6, x3, #0x100
add x7, x1, #0x400
add x8, x2, #0x400
add x9, x3, #0x200
add x10, x1, #0x600
add x11, x2, #0x600
add x12, x3, #0x300
// x13 = number of 16-coefficient blocks.
mov x13, #0x10 // =16
// Preamble, block 0: the odd result accumulates in v13 (low half) / v30
// (high half), the even result in v12 / v22. Loads, de-interleaves and
// multiply-accumulates for the four polynomials are interleaved.
ldr q23, [x2, #0x10]
ldr q19, [x2], #0x20
ldr q17, [x5], #0x20
uzp2 v13.8h, v19.8h, v23.8h
uzp1 v19.8h, v19.8h, v23.8h
ldur q23, [x5, #-0x10]
ldr q30, [x1, #0x10]
uzp2 v9.8h, v17.8h, v23.8h
uzp1 v23.8h, v17.8h, v23.8h
ldr q17, [x1], #0x20
ldr q10, [x7, #0x10]
uzp1 v12.8h, v17.8h, v30.8h
uzp2 v17.8h, v17.8h, v30.8h
smull2 v30.4s, v12.8h, v13.8h
smull v13.4s, v12.4h, v13.4h
smull2 v22.4s, v12.8h, v19.8h
smull v12.4s, v12.4h, v19.4h
smlal2 v30.4s, v17.8h, v19.8h
smlal v13.4s, v17.4h, v19.4h
ldr q19, [x4], #0x20
ldur q16, [x4, #-0x10]
ld1 { v8.8h }, [x3], #16
uzp1 v26.8h, v19.8h, v16.8h
uzp2 v19.8h, v19.8h, v16.8h
smlal2 v30.4s, v26.8h, v9.8h
smlal v13.4s, v26.4h, v9.4h
smlal2 v22.4s, v17.8h, v8.8h
smlal v12.4s, v17.4h, v8.4h
smlal2 v30.4s, v19.8h, v23.8h
smlal v13.4s, v19.4h, v23.4h
smlal2 v22.4s, v26.8h, v23.8h
smlal v12.4s, v26.4h, v23.4h
ldr q23, [x7], #0x20
ldr q17, [x8, #0x10]
uzp1 v9.8h, v23.8h, v10.8h
uzp2 v23.8h, v23.8h, v10.8h
ldr q10, [x10], #0x20
ldur q16, [x10, #-0x10]
ld1 { v8.8h }, [x12], #16
uzp1 v26.8h, v10.8h, v16.8h
uzp2 v10.8h, v10.8h, v16.8h
ld1 { v16.8h }, [x6], #16
ldr q3, [x11, #0x10]
smlal2 v22.4s, v19.8h, v16.8h
smlal v12.4s, v19.4h, v16.4h
ldr q19, [x11], #0x20
ld1 { v16.8h }, [x9], #16
uzp1 v4.8h, v19.8h, v3.8h
uzp2 v19.8h, v19.8h, v3.8h
ldr q3, [x8], #0x20
ldr q31, [x2], #0x20
uzp1 v6.8h, v3.8h, v17.8h
uzp2 v17.8h, v3.8h, v17.8h
smlal2 v22.4s, v9.8h, v6.8h
smlal2 v30.4s, v9.8h, v17.8h
smlal v13.4s, v9.4h, v17.4h
smlal v12.4s, v9.4h, v6.4h
smlal2 v22.4s, v23.8h, v16.8h
smlal2 v30.4s, v23.8h, v6.8h
smlal v13.4s, v23.4h, v6.4h
smlal v12.4s, v23.4h, v16.4h
smlal2 v22.4s, v26.8h, v4.8h
smlal2 v30.4s, v26.8h, v19.8h
smlal v13.4s, v26.4h, v19.4h
smlal v12.4s, v26.4h, v4.4h
smlal2 v22.4s, v10.8h, v8.8h
smlal2 v30.4s, v10.8h, v4.8h
smlal v13.4s, v10.4h, v4.4h
smlal v12.4s, v10.4h, v8.4h
// Montgomery reduction of block 0 (odd result ends up in v15, even result
// in v7), interleaved with the loads and first products of block 1.
ldur q19, [x2, #-0x10]
uzp1 v23.8h, v13.8h, v30.8h
uzp1 v17.8h, v12.8h, v22.8h
mul v23.8h, v23.8h, v2.8h
uzp2 v21.8h, v31.8h, v19.8h
uzp1 v19.8h, v31.8h, v19.8h
mul v17.8h, v17.8h, v2.8h
smlal v13.4s, v23.4h, v0.4h
smlal2 v30.4s, v23.8h, v0.8h
ldr q23, [x5], #0x20
smlal2 v22.4s, v17.8h, v0.8h
uzp2 v15.8h, v13.8h, v30.8h
smlal v12.4s, v17.4h, v0.4h
ldur q17, [x5, #-0x10]
ldr q13, [x1, #0x10]
uzp2 v27.8h, v23.8h, v17.8h
uzp1 v28.8h, v23.8h, v17.8h
uzp2 v7.8h, v12.8h, v22.8h
ldr q23, [x1], #0x20
zip1 v5.8h, v7.8h, v15.8h
ldr q3, [x7, #0x10]
uzp1 v31.8h, v23.8h, v13.8h
uzp2 v16.8h, v23.8h, v13.8h
smull2 v24.4s, v31.8h, v21.8h
ldr q6, [x8, #0x10]
ldr q23, [x10], #0x20
smlal2 v24.4s, v16.8h, v19.8h
ldur q17, [x10, #-0x10]
ld1 { v22.8h }, [x12], #16
uzp1 v30.8h, v23.8h, v17.8h
uzp2 v11.8h, v23.8h, v17.8h
ldr q23, [x4], #0x20
ldur q17, [x4, #-0x10]
ldr q4, [x7], #0x20
uzp1 v20.8h, v23.8h, v17.8h
uzp2 v26.8h, v23.8h, v17.8h
uzp1 v9.8h, v4.8h, v3.8h
smlal2 v24.4s, v20.8h, v27.8h
ld1 { v8.8h }, [x6], #16
ldr q25, [x11, #0x10]
ldr q29, [x11], #0x20
ld1 { v12.8h }, [x9], #16
uzp1 v10.8h, v29.8h, v25.8h
ldr q14, [x8], #0x20
ld1 { v23.8h }, [x3], #16
// Two blocks are handled outside the loop, so it runs 14 times.
sub x13, x13, #0x2
// Loop: finish the block in flight, store the previously reduced block
// (upper half via zip2 of v7/v15, lower half from v5), and load and start
// the next block.
Lpolyvec_basemul_acc_montgomery_cached_k4_loop:
smlal2 v24.4s, v26.8h, v28.8h
uzp2 v4.8h, v4.8h, v3.8h
smull2 v13.4s, v31.8h, v19.8h
ldr q3, [x2], #0x20
uzp2 v1.8h, v29.8h, v25.8h
smlal2 v13.4s, v16.8h, v23.8h
ldur q17, [x2, #-0x10]
smull v18.4s, v31.4h, v19.4h
smlal2 v13.4s, v20.8h, v28.8h
smull v29.4s, v31.4h, v21.4h
ldr q21, [x5], #0x20
smlal2 v13.4s, v26.8h, v8.8h
smlal v29.4s, v16.4h, v19.4h
ldur q19, [x5, #-0x10]
smlal v18.4s, v16.4h, v23.4h
smlal v29.4s, v20.4h, v27.4h
uzp1 v31.8h, v14.8h, v6.8h
uzp2 v27.8h, v21.8h, v19.8h
smlal v18.4s, v20.4h, v28.4h
ldr q25, [x1, #0x10]
smlal v29.4s, v26.4h, v28.4h
smlal v18.4s, v26.4h, v8.4h
uzp2 v26.8h, v14.8h, v6.8h
smlal2 v13.4s, v9.8h, v31.8h
smlal2 v24.4s, v9.8h, v26.8h
smlal v29.4s, v9.4h, v26.4h
smlal v18.4s, v9.4h, v31.4h
smlal2 v13.4s, v4.8h, v12.8h
smlal2 v24.4s, v4.8h, v31.8h
smlal v29.4s, v4.4h, v31.4h
smlal v18.4s, v4.4h, v12.4h
smlal2 v13.4s, v30.8h, v10.8h
smlal2 v24.4s, v30.8h, v1.8h
smlal v29.4s, v30.4h, v1.4h
smlal v18.4s, v30.4h, v10.4h
smlal2 v13.4s, v11.8h, v22.8h
smlal2 v24.4s, v11.8h, v10.8h
smlal v29.4s, v11.4h, v10.4h
smlal v18.4s, v11.4h, v22.4h
ldr q22, [x1], #0x20
uzp1 v31.8h, v29.8h, v24.8h
uzp1 v28.8h, v21.8h, v19.8h
mul v19.8h, v31.8h, v2.8h
uzp1 v31.8h, v22.8h, v25.8h
uzp2 v16.8h, v22.8h, v25.8h
uzp2 v21.8h, v3.8h, v17.8h
smlal v29.4s, v19.4h, v0.4h
smlal2 v24.4s, v19.8h, v0.8h
uzp1 v19.8h, v3.8h, v17.8h
uzp1 v26.8h, v18.8h, v13.8h
zip2 v14.8h, v7.8h, v15.8h
mul v23.8h, v26.8h, v2.8h
uzp2 v15.8h, v29.8h, v24.8h
smull2 v24.4s, v31.8h, v21.8h
str q14, [x0, #0x10]
ldr q3, [x7, #0x10]
ldr q6, [x8, #0x10]
ldr q8, [x10], #0x20
ldur q26, [x10, #-0x10]
ld1 { v22.8h }, [x12], #16
uzp1 v30.8h, v8.8h, v26.8h
uzp2 v11.8h, v8.8h, v26.8h
ldr q8, [x4], #0x20
ldur q26, [x4, #-0x10]
ldr q4, [x7], #0x20
uzp1 v20.8h, v8.8h, v26.8h
uzp2 v26.8h, v8.8h, v26.8h
ld1 { v8.8h }, [x6], #16
uzp1 v9.8h, v4.8h, v3.8h
ldr q25, [x11, #0x10]
ldr q29, [x11], #0x20
ld1 { v12.8h }, [x9], #16
ldr q14, [x8], #0x20
smlal2 v24.4s, v16.8h, v19.8h
smlal2 v13.4s, v23.8h, v0.8h
smlal v18.4s, v23.4h, v0.4h
ld1 { v23.8h }, [x3], #16
smlal2 v24.4s, v20.8h, v27.8h
uzp2 v7.8h, v18.8h, v13.8h
uzp1 v10.8h, v29.8h, v25.8h
str q5, [x0], #0x20
zip1 v5.8h, v7.8h, v15.8h
sub x13, x13, #0x1
cbnz x13, Lpolyvec_basemul_acc_montgomery_cached_k4_loop
// Tail: finish and reduce the last block, store the pending block and then
// the last block. No further loads are issued.
smull2 v17.4s, v31.8h, v19.8h
uzp2 v1.8h, v14.8h, v6.8h
smull v18.4s, v31.4h, v21.4h
smlal2 v24.4s, v26.8h, v28.8h
smlal2 v17.4s, v16.8h, v23.8h
smull v21.4s, v31.4h, v19.4h
smlal v18.4s, v16.4h, v19.4h
uzp2 v31.8h, v4.8h, v3.8h
uzp1 v3.8h, v14.8h, v6.8h
smlal v21.4s, v16.4h, v23.4h
smlal v18.4s, v20.4h, v27.4h
uzp2 v14.8h, v29.8h, v25.8h
smlal2 v17.4s, v20.8h, v28.8h
smlal v21.4s, v20.4h, v28.4h
smlal v18.4s, v26.4h, v28.4h
smlal2 v24.4s, v9.8h, v1.8h
smlal2 v17.4s, v26.8h, v8.8h
smlal v21.4s, v26.4h, v8.4h
smlal v18.4s, v9.4h, v1.4h
smlal2 v24.4s, v31.8h, v3.8h
smlal2 v17.4s, v9.8h, v3.8h
smlal v21.4s, v9.4h, v3.4h
smlal v18.4s, v31.4h, v3.4h
smlal2 v24.4s, v30.8h, v14.8h
smlal2 v17.4s, v31.8h, v12.8h
smlal v21.4s, v31.4h, v12.4h
smlal v18.4s, v30.4h, v14.4h
smlal2 v24.4s, v11.8h, v10.8h
smlal2 v17.4s, v30.8h, v10.8h
smlal v21.4s, v30.4h, v10.4h
smlal v18.4s, v11.4h, v10.4h
zip2 v19.8h, v7.8h, v15.8h
smlal2 v17.4s, v11.8h, v22.8h
smlal v21.4s, v11.4h, v22.4h
uzp1 v23.8h, v18.8h, v24.8h
str q19, [x0, #0x10]
mul v19.8h, v23.8h, v2.8h
uzp1 v23.8h, v21.8h, v17.8h
str q5, [x0], #0x20
mul v26.8h, v23.8h, v2.8h
smlal v18.4s, v19.4h, v0.4h
smlal2 v24.4s, v19.8h, v0.8h
smlal v21.4s, v26.4h, v0.4h
smlal2 v17.4s, v26.8h, v0.8h
uzp2 v13.8h, v18.8h, v24.8h
uzp2 v19.8h, v21.8h, v17.8h
zip1 v23.8h, v19.8h, v13.8h
zip2 v19.8h, v19.8h, v13.8h
str q23, [x0], #0x20
stur q19, [x0, #-0x10]
// Epilogue: restore d8-d15 and release the stack frame.
ldp d8, d9, [sp]
.cfi_restore d8
.cfi_restore d9
ldp d10, d11, [sp, #0x10]
.cfi_restore d10
.cfi_restore d11
ldp d12, d13, [sp, #0x20]
.cfi_restore d12
.cfi_restore d13
ldp d14, d15, [sp, #0x30]
.cfi_restore d14
.cfi_restore d15
add sp, sp, #0x40
.cfi_adjust_cfa_offset -0x40
ret
.cfi_endproc

View File

@@ -0,0 +1,219 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*yaml
Name: rej_uniform_asm
Description: Run rejection sampling on uniform random bytes to generate uniform random integers mod q
Signature: uint64_t mlk_rej_uniform_asm(int16_t r[256], const uint8_t *buf, unsigned buflen, const uint8_t table[2048])
ABI:
x0:
type: buffer
size_bytes: 512
permissions: write-only
c_parameter: int16_t r[256]
description: Output buffer
x1:
type: buffer
size_bytes: x2
permissions: read-only
c_parameter: const uint8_t *buf
description: Input buffer
x2:
type: scalar
c_parameter: unsigned buflen
description: Length of input buffer (must be multiple of 24)
test_with: 504 # MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE
x3:
type: buffer
size_bytes: 2048
permissions: read-only
c_parameter: const uint8_t table[2048]
description: Lookup table
Stack:
bytes: 576
description: register preservation and temporary storage
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/aarch64_opt/src/rej_uniform_asm.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_rej_uniform_asm)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_rej_uniform_asm)
S2N_BN_SYMBOL(mlkem_rej_uniform_asm):
// Rejection sampling: unpack 12-bit candidates from the input bytes, keep
// those below 3329, and return how many were kept (capped at 256).
//
// Register roles as used below:
//   x0  output pointer (in), number of accepted values (out)
//   x1  input pointer, post-incremented by the ld3 loads
//   x2  number of input bytes still unread
//   x3  base of the lookup table (16 bytes per entry)
//   x4  constant 256, the cap on accepted values
//   x7  write cursor into the stack scratch buffer
//   x8  base of the stack scratch buffer
//   x9  running count of accepted values
//   v30 3329 in every halfword lane
//   v31 per-lane bit weights { 1, 2, 4, ..., 128 }
.cfi_startproc
// Reserve 0x240 (576) bytes of stack. Only the first 512 bytes are copied
// out at the end; the remaining 64 bytes are presumably headroom for the
// full-width stores below overshooting the cap -- TODO confirm.
sub sp, sp, #0x240
.cfi_adjust_cfa_offset 0x240
// Build v31 = { 1, 2, 4, 8, 16, 32, 64, 128 } as eight halfwords.
mov x7, #0x1 // =1
movk x7, #0x2, lsl #16
movk x7, #0x4, lsl #32
movk x7, #0x8, lsl #48
mov v31.d[0], x7
mov x7, #0x10 // =16
movk x7, #0x20, lsl #16
movk x7, #0x40, lsl #32
movk x7, #0x80, lsl #48
mov v31.d[1], x7
// v30 = 3329 broadcast to all eight halfword lanes (the rejection bound).
mov w11, #0xd01 // =3329
dup v30.8h, w11
// x8 = scratch base, x7 = cursor, x11 = loop counter, v16 = all zero.
mov x8, sp
mov x7, x8
mov x11, #0x0 // =0
eor v16.16b, v16.16b, v16.16b
// Zero the first 512 bytes of scratch: 64 bytes per iteration, with x11
// stepping by 0x20 up to 0x100 (8 iterations).
Lrej_uniform_initial_zero:
str q16, [x7], #0x40
stur q16, [x7, #-0x30]
stur q16, [x7, #-0x20]
stur q16, [x7, #-0x10]
add x11, x11, #0x20
cmp x11, #0x100
b.lt Lrej_uniform_initial_zero
// Rewind the cursor, clear the accepted count, and set the cap to 256.
mov x7, x8
mov x9, #0x0 // =0
mov x4, #0x100 // =256
// Fewer than 48 input bytes: skip the main loop and try the 24-byte tail.
cmp x2, #0x30
b.lo Lrej_uniform_loop48_end
// Main loop: each iteration consumes 48 input bytes (32 candidates).
Lrej_uniform_loop48:
// Stop as soon as at least 256 values have been accepted.
cmp x9, x4
b.hs Lrej_uniform_memory_copy
sub x2, x2, #0x30
// De-interleave the 48 bytes: v0 = bytes 0,3,6,..., v1 = bytes 1,4,7,...,
// v2 = bytes 2,5,8,...
ld3 { v0.16b, v1.16b, v2.16b }, [x1], #48
// Pair (b0,b1) and (b1,b2) of every 3-byte group into halfwords.
zip1 v4.16b, v0.16b, v1.16b
zip2 v5.16b, v0.16b, v1.16b
zip1 v6.16b, v1.16b, v2.16b
zip2 v7.16b, v1.16b, v2.16b
// First candidate of each group: b0 | (b1 & 0x0f) << 8 (clear bits 12..15).
bic v4.8h, #0xf0, lsl #8
bic v5.8h, #0xf0, lsl #8
// Second candidate of each group: (b1 >> 4) | b2 << 4.
ushr v6.8h, v6.8h, #0x4
ushr v7.8h, v7.8h, #0x4
// Interleave first/second candidates so v16..v19 hold the 32 candidates in
// input order, 8 per vector.
zip1 v16.8h, v4.8h, v6.8h
zip2 v17.8h, v4.8h, v6.8h
zip1 v18.8h, v5.8h, v7.8h
zip2 v19.8h, v5.8h, v7.8h
// Lane mask: all ones where 3329 > candidate (unsigned), else zero.
cmhi v4.8h, v30.8h, v16.8h
cmhi v5.8h, v30.8h, v17.8h
cmhi v6.8h, v30.8h, v18.8h
cmhi v7.8h, v30.8h, v19.8h
// Reduce each lane mask to its bit weight (lane k keeps 1 << k).
and v4.16b, v4.16b, v31.16b
and v5.16b, v5.16b, v31.16b
and v6.16b, v6.16b, v31.16b
and v7.16b, v7.16b, v31.16b
// Sum across lanes: an 8-bit index with one bit per accepted lane.
uaddlv s20, v4.8h
uaddlv s21, v5.8h
uaddlv s22, v6.8h
uaddlv s23, v7.8h
fmov w12, s20
fmov w13, s21
fmov w14, s22
fmov w15, s23
// Fetch the 16-byte table entry for each index (index * 16).
ldr q24, [x3, x12, lsl #4]
ldr q25, [x3, x13, lsl #4]
ldr q26, [x3, x14, lsl #4]
ldr q27, [x3, x15, lsl #4]
// Each lane holds at most one set bit, so the per-byte popcount summed
// across lanes is the number of accepted lanes in that vector.
cnt v4.16b, v4.16b
cnt v5.16b, v5.16b
cnt v6.16b, v6.16b
cnt v7.16b, v7.16b
uaddlv s20, v4.8h
uaddlv s21, v5.8h
uaddlv s22, v6.8h
uaddlv s23, v7.8h
fmov w12, s20
fmov w13, s21
fmov w14, s22
fmov w15, s23
// Permute the candidate bytes with the table entry as the index vector.
tbl v16.16b, { v16.16b }, v24.16b
tbl v17.16b, { v17.16b }, v25.16b
tbl v18.16b, { v18.16b }, v26.16b
tbl v19.16b, { v19.16b }, v27.16b
// Store all 16 bytes but advance the cursor by only 2 * accepted bytes, so
// the next store overwrites whatever lies past the accepted values.
str q16, [x7]
add x7, x7, x12, lsl #1
str q17, [x7]
add x7, x7, x13, lsl #1
str q18, [x7]
add x7, x7, x14, lsl #1
str q19, [x7]
add x7, x7, x15, lsl #1
// x9 += number of values accepted in this iteration.
add x12, x12, x13
add x14, x14, x15
add x9, x9, x12
add x9, x9, x14
// Keep looping while at least 48 input bytes remain.
cmp x2, #0x30
b.hs Lrej_uniform_loop48
// Tail: process at most one further 24-byte chunk (16 candidates), using
// the same steps as the main loop on half the data.
Lrej_uniform_loop48_end:
cmp x9, x4
b.hs Lrej_uniform_memory_copy
cmp x2, #0x18
b.lo Lrej_uniform_memory_copy
sub x2, x2, #0x18
// De-interleave 24 bytes into three 8-byte vectors.
ld3 { v0.8b, v1.8b, v2.8b }, [x1], #24
zip1 v4.16b, v0.16b, v1.16b
zip1 v5.16b, v1.16b, v2.16b
bic v4.8h, #0xf0, lsl #8
ushr v5.8h, v5.8h, #0x4
zip1 v16.8h, v4.8h, v5.8h
zip2 v17.8h, v4.8h, v5.8h
cmhi v4.8h, v30.8h, v16.8h
cmhi v5.8h, v30.8h, v17.8h
and v4.16b, v4.16b, v31.16b
and v5.16b, v5.16b, v31.16b
uaddlv s20, v4.8h
uaddlv s21, v5.8h
fmov w12, s20
fmov w13, s21
ldr q24, [x3, x12, lsl #4]
ldr q25, [x3, x13, lsl #4]
cnt v4.16b, v4.16b
cnt v5.16b, v5.16b
uaddlv s20, v4.8h
uaddlv s21, v5.8h
fmov w12, s20
fmov w13, s21
tbl v16.16b, { v16.16b }, v24.16b
tbl v17.16b, { v17.16b }, v25.16b
str q16, [x7]
add x7, x7, x12, lsl #1
str q17, [x7]
add x7, x7, x13, lsl #1
add x9, x9, x12
add x9, x9, x13
// Finish: clamp the count to 256 and copy the scratch buffer to the output.
Lrej_uniform_memory_copy:
// x9 = min(x9, 256)
cmp x9, x4
csel x9, x9, x4, lo
mov x11, #0x0 // =0
mov x7, x8
// Copy the first 512 bytes of scratch to [x0], 64 bytes per iteration
// (8 iterations). The full 512 bytes are copied regardless of the count.
Lrej_uniform_final_copy:
ldr q16, [x7], #0x40
ldur q17, [x7, #-0x30]
ldur q18, [x7, #-0x20]
ldur q19, [x7, #-0x10]
str q16, [x0], #0x40
stur q17, [x0, #-0x30]
stur q18, [x0, #-0x20]
stur q19, [x0, #-0x10]
add x11, x11, #0x20
cmp x11, #0x100
b.lt Lrej_uniform_final_copy
// Return the clamped count of accepted values.
mov x0, x9
// Branch to the label that immediately follows.
b Lrej_uniform_return
Lrej_uniform_return:
// Release the stack frame.
add sp, sp, #0x240
.cfi_adjust_cfa_offset -0x240
ret
.cfi_endproc

View File

@@ -0,0 +1,543 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* WARNING: This file is auto-generated from scripts/autogen
* in the mlkem-native repository.
* Do not modify it directly.
*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_AARCH64) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <stdint.h>
#include "arith_native_aarch64.h"
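/*
* Lookup table used by rejection sampling of the public matrix.
*
* The table holds 256 entries of 16 bytes each. The entry index is an
* 8-bit mask with one bit per 16-bit lane of a 128-bit vector. Entry i
* lists, in ascending lane order, the byte offsets (2k, 2k+1) of every
* lane k whose bit is set in i; all remaining bytes are padded with 255,
* which is not a valid offset into a 16-byte vector. Used as a byte
* permutation index, an entry therefore moves the selected lanes to the
* front of the vector.
*
* See autogen for details.
*/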
MLK_ALIGN const uint8_t mlk_rej_uniform_table[] = {
255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 0 */,
0, 1, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 1 */,
2, 3, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 2 */,
0, 1, 2, 3, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 3 */,
4, 5, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 4 */,
0, 1, 4, 5, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 5 */,
2, 3, 4, 5, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 6 */,
0, 1, 2, 3, 4, 5, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 7 */,
6, 7, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 8 */,
0, 1, 6, 7, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 9 */,
2, 3, 6, 7, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 10 */,
0, 1, 2, 3, 6, 7, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 11 */,
4, 5, 6, 7, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 12 */,
0, 1, 4, 5, 6, 7, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 13 */,
2, 3, 4, 5, 6, 7, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 14 */,
0, 1, 2, 3, 4, 5, 6, 7,
255, 255, 255, 255, 255, 255, 255, 255 /* 15 */,
8, 9, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 16 */,
0, 1, 8, 9, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 17 */,
2, 3, 8, 9, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 18 */,
0, 1, 2, 3, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 19 */,
4, 5, 8, 9, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 20 */,
0, 1, 4, 5, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 21 */,
2, 3, 4, 5, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 22 */,
0, 1, 2, 3, 4, 5, 8, 9,
255, 255, 255, 255, 255, 255, 255, 255 /* 23 */,
6, 7, 8, 9, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 24 */,
0, 1, 6, 7, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 25 */,
2, 3, 6, 7, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 26 */,
0, 1, 2, 3, 6, 7, 8, 9,
255, 255, 255, 255, 255, 255, 255, 255 /* 27 */,
4, 5, 6, 7, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 28 */,
0, 1, 4, 5, 6, 7, 8, 9,
255, 255, 255, 255, 255, 255, 255, 255 /* 29 */,
2, 3, 4, 5, 6, 7, 8, 9,
255, 255, 255, 255, 255, 255, 255, 255 /* 30 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 255, 255, 255, 255, 255, 255 /* 31 */,
10, 11, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 32 */,
0, 1, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 33 */,
2, 3, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 34 */,
0, 1, 2, 3, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 35 */,
4, 5, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 36 */,
0, 1, 4, 5, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 37 */,
2, 3, 4, 5, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 38 */,
0, 1, 2, 3, 4, 5, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 39 */,
6, 7, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 40 */,
0, 1, 6, 7, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 41 */,
2, 3, 6, 7, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 42 */,
0, 1, 2, 3, 6, 7, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 43 */,
4, 5, 6, 7, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 44 */,
0, 1, 4, 5, 6, 7, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 45 */,
2, 3, 4, 5, 6, 7, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 46 */,
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 255, 255, 255, 255, 255, 255 /* 47 */,
8, 9, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 48 */,
0, 1, 8, 9, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 49 */,
2, 3, 8, 9, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 50 */,
0, 1, 2, 3, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 51 */,
4, 5, 8, 9, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 52 */,
0, 1, 4, 5, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 53 */,
2, 3, 4, 5, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 54 */,
0, 1, 2, 3, 4, 5, 8, 9,
10, 11, 255, 255, 255, 255, 255, 255 /* 55 */,
6, 7, 8, 9, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 56 */,
0, 1, 6, 7, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 57 */,
2, 3, 6, 7, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 58 */,
0, 1, 2, 3, 6, 7, 8, 9,
10, 11, 255, 255, 255, 255, 255, 255 /* 59 */,
4, 5, 6, 7, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 60 */,
0, 1, 4, 5, 6, 7, 8, 9,
10, 11, 255, 255, 255, 255, 255, 255 /* 61 */,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 255, 255, 255, 255, 255, 255 /* 62 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 255, 255, 255, 255 /* 63 */,
12, 13, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 64 */,
0, 1, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 65 */,
2, 3, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 66 */,
0, 1, 2, 3, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 67 */,
4, 5, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 68 */,
0, 1, 4, 5, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 69 */,
2, 3, 4, 5, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 70 */,
0, 1, 2, 3, 4, 5, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 71 */,
6, 7, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 72 */,
0, 1, 6, 7, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 73 */,
2, 3, 6, 7, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 74 */,
0, 1, 2, 3, 6, 7, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 75 */,
4, 5, 6, 7, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 76 */,
0, 1, 4, 5, 6, 7, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 77 */,
2, 3, 4, 5, 6, 7, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 78 */,
0, 1, 2, 3, 4, 5, 6, 7,
12, 13, 255, 255, 255, 255, 255, 255 /* 79 */,
8, 9, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 80 */,
0, 1, 8, 9, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 81 */,
2, 3, 8, 9, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 82 */,
0, 1, 2, 3, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 83 */,
4, 5, 8, 9, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 84 */,
0, 1, 4, 5, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 85 */,
2, 3, 4, 5, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 86 */,
0, 1, 2, 3, 4, 5, 8, 9,
12, 13, 255, 255, 255, 255, 255, 255 /* 87 */,
6, 7, 8, 9, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 88 */,
0, 1, 6, 7, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 89 */,
2, 3, 6, 7, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 90 */,
0, 1, 2, 3, 6, 7, 8, 9,
12, 13, 255, 255, 255, 255, 255, 255 /* 91 */,
4, 5, 6, 7, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 92 */,
0, 1, 4, 5, 6, 7, 8, 9,
12, 13, 255, 255, 255, 255, 255, 255 /* 93 */,
2, 3, 4, 5, 6, 7, 8, 9,
12, 13, 255, 255, 255, 255, 255, 255 /* 94 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 12, 13, 255, 255, 255, 255 /* 95 */,
10, 11, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 96 */,
0, 1, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 97 */,
2, 3, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 98 */,
0, 1, 2, 3, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 99 */,
4, 5, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 100 */,
0, 1, 4, 5, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 101 */,
2, 3, 4, 5, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 102 */,
0, 1, 2, 3, 4, 5, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 103 */,
6, 7, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 104 */,
0, 1, 6, 7, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 105 */,
2, 3, 6, 7, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 106 */,
0, 1, 2, 3, 6, 7, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 107 */,
4, 5, 6, 7, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 108 */,
0, 1, 4, 5, 6, 7, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 109 */,
2, 3, 4, 5, 6, 7, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 110 */,
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 12, 13, 255, 255, 255, 255 /* 111 */,
8, 9, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 112 */,
0, 1, 8, 9, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 113 */,
2, 3, 8, 9, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 114 */,
0, 1, 2, 3, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 115 */,
4, 5, 8, 9, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 116 */,
0, 1, 4, 5, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 117 */,
2, 3, 4, 5, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 118 */,
0, 1, 2, 3, 4, 5, 8, 9,
10, 11, 12, 13, 255, 255, 255, 255 /* 119 */,
6, 7, 8, 9, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 120 */,
0, 1, 6, 7, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 121 */,
2, 3, 6, 7, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 122 */,
0, 1, 2, 3, 6, 7, 8, 9,
10, 11, 12, 13, 255, 255, 255, 255 /* 123 */,
4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 124 */,
0, 1, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 255, 255, 255, 255 /* 125 */,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 255, 255, 255, 255 /* 126 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 255, 255 /* 127 */,
14, 15, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 128 */,
0, 1, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 129 */,
2, 3, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 130 */,
0, 1, 2, 3, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 131 */,
4, 5, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 132 */,
0, 1, 4, 5, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 133 */,
2, 3, 4, 5, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 134 */,
0, 1, 2, 3, 4, 5, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 135 */,
6, 7, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 136 */,
0, 1, 6, 7, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 137 */,
2, 3, 6, 7, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 138 */,
0, 1, 2, 3, 6, 7, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 139 */,
4, 5, 6, 7, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 140 */,
0, 1, 4, 5, 6, 7, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 141 */,
2, 3, 4, 5, 6, 7, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 142 */,
0, 1, 2, 3, 4, 5, 6, 7,
14, 15, 255, 255, 255, 255, 255, 255 /* 143 */,
8, 9, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 144 */,
0, 1, 8, 9, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 145 */,
2, 3, 8, 9, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 146 */,
0, 1, 2, 3, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 147 */,
4, 5, 8, 9, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 148 */,
0, 1, 4, 5, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 149 */,
2, 3, 4, 5, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 150 */,
0, 1, 2, 3, 4, 5, 8, 9,
14, 15, 255, 255, 255, 255, 255, 255 /* 151 */,
6, 7, 8, 9, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 152 */,
0, 1, 6, 7, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 153 */,
2, 3, 6, 7, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 154 */,
0, 1, 2, 3, 6, 7, 8, 9,
14, 15, 255, 255, 255, 255, 255, 255 /* 155 */,
4, 5, 6, 7, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 156 */,
0, 1, 4, 5, 6, 7, 8, 9,
14, 15, 255, 255, 255, 255, 255, 255 /* 157 */,
2, 3, 4, 5, 6, 7, 8, 9,
14, 15, 255, 255, 255, 255, 255, 255 /* 158 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 14, 15, 255, 255, 255, 255 /* 159 */,
10, 11, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 160 */,
0, 1, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 161 */,
2, 3, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 162 */,
0, 1, 2, 3, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 163 */,
4, 5, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 164 */,
0, 1, 4, 5, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 165 */,
2, 3, 4, 5, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 166 */,
0, 1, 2, 3, 4, 5, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 167 */,
6, 7, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 168 */,
0, 1, 6, 7, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 169 */,
2, 3, 6, 7, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 170 */,
0, 1, 2, 3, 6, 7, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 171 */,
4, 5, 6, 7, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 172 */,
0, 1, 4, 5, 6, 7, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 173 */,
2, 3, 4, 5, 6, 7, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 174 */,
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 14, 15, 255, 255, 255, 255 /* 175 */,
8, 9, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 176 */,
0, 1, 8, 9, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 177 */,
2, 3, 8, 9, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 178 */,
0, 1, 2, 3, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 179 */,
4, 5, 8, 9, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 180 */,
0, 1, 4, 5, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 181 */,
2, 3, 4, 5, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 182 */,
0, 1, 2, 3, 4, 5, 8, 9,
10, 11, 14, 15, 255, 255, 255, 255 /* 183 */,
6, 7, 8, 9, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 184 */,
0, 1, 6, 7, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 185 */,
2, 3, 6, 7, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 186 */,
0, 1, 2, 3, 6, 7, 8, 9,
10, 11, 14, 15, 255, 255, 255, 255 /* 187 */,
4, 5, 6, 7, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 188 */,
0, 1, 4, 5, 6, 7, 8, 9,
10, 11, 14, 15, 255, 255, 255, 255 /* 189 */,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 14, 15, 255, 255, 255, 255 /* 190 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 14, 15, 255, 255 /* 191 */,
12, 13, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 192 */,
0, 1, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 193 */,
2, 3, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 194 */,
0, 1, 2, 3, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 195 */,
4, 5, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 196 */,
0, 1, 4, 5, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 197 */,
2, 3, 4, 5, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 198 */,
0, 1, 2, 3, 4, 5, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 199 */,
6, 7, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 200 */,
0, 1, 6, 7, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 201 */,
2, 3, 6, 7, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 202 */,
0, 1, 2, 3, 6, 7, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 203 */,
4, 5, 6, 7, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 204 */,
0, 1, 4, 5, 6, 7, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 205 */,
2, 3, 4, 5, 6, 7, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 206 */,
0, 1, 2, 3, 4, 5, 6, 7,
12, 13, 14, 15, 255, 255, 255, 255 /* 207 */,
8, 9, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 208 */,
0, 1, 8, 9, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 209 */,
2, 3, 8, 9, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 210 */,
0, 1, 2, 3, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 211 */,
4, 5, 8, 9, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 212 */,
0, 1, 4, 5, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 213 */,
2, 3, 4, 5, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 214 */,
0, 1, 2, 3, 4, 5, 8, 9,
12, 13, 14, 15, 255, 255, 255, 255 /* 215 */,
6, 7, 8, 9, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 216 */,
0, 1, 6, 7, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 217 */,
2, 3, 6, 7, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 218 */,
0, 1, 2, 3, 6, 7, 8, 9,
12, 13, 14, 15, 255, 255, 255, 255 /* 219 */,
4, 5, 6, 7, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 220 */,
0, 1, 4, 5, 6, 7, 8, 9,
12, 13, 14, 15, 255, 255, 255, 255 /* 221 */,
2, 3, 4, 5, 6, 7, 8, 9,
12, 13, 14, 15, 255, 255, 255, 255 /* 222 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 12, 13, 14, 15, 255, 255 /* 223 */,
10, 11, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 224 */,
0, 1, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 225 */,
2, 3, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 226 */,
0, 1, 2, 3, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 227 */,
4, 5, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 228 */,
0, 1, 4, 5, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 229 */,
2, 3, 4, 5, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 230 */,
0, 1, 2, 3, 4, 5, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 231 */,
6, 7, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 232 */,
0, 1, 6, 7, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 233 */,
2, 3, 6, 7, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 234 */,
0, 1, 2, 3, 6, 7, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 235 */,
4, 5, 6, 7, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 236 */,
0, 1, 4, 5, 6, 7, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 237 */,
2, 3, 4, 5, 6, 7, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 238 */,
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 12, 13, 14, 15, 255, 255 /* 239 */,
8, 9, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 240 */,
0, 1, 8, 9, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 241 */,
2, 3, 8, 9, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 242 */,
0, 1, 2, 3, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 243 */,
4, 5, 8, 9, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 244 */,
0, 1, 4, 5, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 245 */,
2, 3, 4, 5, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 246 */,
0, 1, 2, 3, 4, 5, 8, 9,
10, 11, 12, 13, 14, 15, 255, 255 /* 247 */,
6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 248 */,
0, 1, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 249 */,
2, 3, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 250 */,
0, 1, 2, 3, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 255, 255 /* 251 */,
4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 252 */,
0, 1, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 255, 255 /* 253 */,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 255, 255 /* 254 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15 /* 255 */,
};
#else /* MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(aarch64_rej_uniform_table)
#endif /* !(MLK_ARITH_BACKEND_AARCH64 && !MLK_CONFIG_MULTILEVEL_NO_SHARED) */

View File

@@ -0,0 +1,569 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_NATIVE_API_H
#define MLK_NATIVE_API_H
/*
* Native arithmetic interface
*
* This header is primarily for documentation purposes.
* It should not be included by backend implementations.
*
* Instead, the header is included automatically after the
* active backend has been included, in order to check that the
* backend's function signatures are consistent with this
* interface, and to run sanity checks.
*/
#include <stdint.h>
#include "../cbmc.h"
#include "../common.h"
/* Backends must return MLK_NATIVE_FUNC_SUCCESS upon success. */
#define MLK_NATIVE_FUNC_SUCCESS (0)
/* Backends may return MLK_NATIVE_FUNC_FALLBACK to signal to the frontend that
* the target/parameters are unsupported; typically, this would be because of
* dependencies on CPU features not detected on the host CPU. In this case,
* the frontend falls back to the default C implementation. */
#define MLK_NATIVE_FUNC_FALLBACK (-1)
/* Absolute exclusive upper bound for the output of the inverse NTT
*
* NOTE: This is the same bound as in poly.h and has to be kept
* in sync. */
#define MLK_INVNTT_BOUND (8 * MLKEM_Q)
/* Absolute exclusive upper bound for the output of the forward NTT
*
* NOTE: This is the same bound as in poly.h and has to be kept
* in sync. */
#define MLK_NTT_BOUND (8 * MLKEM_Q)
/*
* This is the C<->native interface allowing for the drop-in of
* native code for performance critical arithmetic components of ML-KEM.
*
* A _backend_ is a specific implementation of (part of) this interface.
*
* To add a function to a backend, define MLK_USE_NATIVE_XXX and
* implement `static inline xxx(...)` in the profile header.
*
* The only exception is MLK_USE_NATIVE_NTT_CUSTOM_ORDER. This option can
* be set if there are native implementations for all of NTT, invNTT,
* mulcache computation, base multiplication, and to/from byte stream
* conversions, and allows the native implementation to use a
* custom order of polynomial coefficients in NTT domain -- the use of such
* custom order is not an implementation-detail since the public matrix
* is generated in NTT domain. In this case, a permutation function
* mlk_poly_permute_bitrev_to_custom() needs to be provided that permutes
* polynomials in NTT domain from bitreversed to the custom order.
*/
/*
* These functions are meant to be trivial wrappers around the chosen native
* implementation. They are static inline to avoid unnecessary calls.
* The macro before each declaration controls whether a native
* implementation is present.
*/
#if defined(MLK_USE_NATIVE_NTT)
/*************************************************
* Name: mlk_ntt_native
*
* Description: Computes negacyclic number-theoretic transform (NTT) of
* a polynomial in place.
*
* The input polynomial is assumed to be in normal order.
* The output polynomial is in bitreversed order, or of a
* custom order if MLK_USE_NATIVE_NTT_CUSTOM_ORDER is set.
* See the documentation of MLK_USE_NATIVE_NTT_CUSTOM_ORDER
* for more information.
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial.
* On entry, the coefficients must satisfy the absolute bound
* MLKEM_Q stated in the contract below.
*
* Returns: - MLK_NATIVE_FUNC_SUCCESS: p holds the transformed
* polynomial, with coefficients satisfying the absolute
* bound MLK_NTT_BOUND.
* - MLK_NATIVE_FUNC_FALLBACK: p is unchanged; the frontend
* falls back to the default C implementation.
**************************************************/
static MLK_INLINE int mlk_ntt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_NTT_BOUND))
ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_NTT */
#if defined(MLK_USE_NATIVE_NTT_CUSTOM_ORDER)
/*
* This must only be set if NTT, invNTT, basemul, mulcache, and
* to/from byte stream conversions all have native implementations
* that are adapted to the custom order.
*/
#if !defined(MLK_USE_NATIVE_NTT) || !defined(MLK_USE_NATIVE_INTT) || \
!defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE) || \
!defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED) || \
!defined(MLK_USE_NATIVE_POLY_TOBYTES) || \
!defined(MLK_USE_NATIVE_POLY_FROMBYTES)
#error \
"Invalid native profile: MLK_USE_NATIVE_NTT_CUSTOM_ORDER can only be \
set if there are native implementations for NTT, invNTT, mulcache, basemul, \
and to/from bytes conversions."
#endif /* !MLK_USE_NATIVE_NTT || !MLK_USE_NATIVE_INTT || \
!MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE || \
!MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED || \
!MLK_USE_NATIVE_POLY_TOBYTES || !MLK_USE_NATIVE_POLY_FROMBYTES */
/*************************************************
* Name: mlk_poly_permute_bitrev_to_custom
*
* Description: When MLK_USE_NATIVE_NTT_CUSTOM_ORDER is defined,
* convert a polynomial in NTT domain from bitreversed
* order to the custom order output by the native NTT.
*
* This must only be defined if there is native code for
* all of (a) NTT, (b) invNTT, (c) basemul, (d) mulcache,
* and (e) to/from byte stream conversions.
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial.
* The coefficient bound (0 to MLKEM_Q) stated in the
* contract below is required on entry and preserved
* on exit.
**************************************************/
static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t p[MLKEM_N])
__contract__(
/* We don't specify that this should be a permutation, but only
* that it does not change the bound established at the end of mlk_gen_matrix. */
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
requires(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
ensures(array_bound(p, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* MLK_USE_NATIVE_NTT_CUSTOM_ORDER */
#if defined(MLK_USE_NATIVE_INTT)
/*************************************************
* Name: mlk_intt_native
*
* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
* of a polynomial in place.
*
* The input polynomial is in bitreversed order, or of a
* custom order if MLK_USE_NATIVE_NTT_CUSTOM_ORDER is set.
* See the documentation of MLK_USE_NATIVE_NTT_CUSTOM_ORDER
* for more information.
* The output polynomial is in normal order.
*
* Arguments: - int16_t p[MLKEM_N]: pointer to in/output polynomial
*
* Returns: - MLK_NATIVE_FUNC_SUCCESS: p holds the transformed
* polynomial, with coefficients satisfying the absolute
* bound MLK_INVNTT_BOUND.
* - MLK_NATIVE_FUNC_FALLBACK: p is unchanged; the frontend
* falls back to the default C implementation.
**************************************************/
static MLK_INLINE int mlk_intt_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLK_INVNTT_BOUND))
ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_INTT */
#if defined(MLK_USE_NATIVE_POLY_REDUCE)
/*************************************************
* Name: mlk_poly_reduce_native
*
* Description: Applies modular reduction to all coefficients of a polynomial.
*
* Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial
*
* Returns: - MLK_NATIVE_FUNC_SUCCESS: the coefficients of p satisfy
* the bound (0 to MLKEM_Q) stated in the contract below.
* - MLK_NATIVE_FUNC_FALLBACK: p is unchanged; the frontend
* falls back to the default C implementation.
**************************************************/
static MLK_INLINE int mlk_poly_reduce_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(p, 0, MLKEM_N, 0, MLKEM_Q))
ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
#if defined(MLK_USE_NATIVE_POLY_TOMONT)
/*************************************************
* Name: mlk_poly_tomont_native
*
* Description: Inplace conversion of all coefficients of a polynomial
* from normal domain to Montgomery domain
*
* Arguments: - int16_t p[MLKEM_N]: pointer to input/output polynomial
*
* Returns: - MLK_NATIVE_FUNC_SUCCESS: the coefficients of p satisfy
* the absolute bound MLKEM_Q.
* - MLK_NATIVE_FUNC_FALLBACK: p is unchanged; the frontend
* falls back to the default C implementation.
**************************************************/
static MLK_INLINE int mlk_poly_tomont_native(int16_t p[MLKEM_N])
__contract__(
requires(memory_no_alias(p, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(p, sizeof(int16_t) * MLKEM_N))
ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(p, 0, MLKEM_N, MLKEM_Q))
ensures((return_value == MLK_NATIVE_FUNC_FALLBACK) ==> array_unchanged(p, MLKEM_N))
);
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/*************************************************
* Name: mlk_poly_mulcache_compute_native
*
* Description: Compute multiplication cache for a polynomial
* in NTT domain.
*
* The purpose of the multiplication cache is to
* cache repeated computations required during a
* base multiplication of polynomials in NTT domain.
* The structure of the multiplication-cache is
* implementation defined.
*
* Arguments: INPUT:
* - mlk_poly: const pointer to input polynomial.
* This must be in NTT domain and in bitreversed order, or of
* a custom order if MLK_USE_NATIVE_NTT_CUSTOM_ORDER is set.
* See the documentation of MLK_USE_NATIVE_NTT_CUSTOM_ORDER
* for more information.
* OUTPUT
* - cache: pointer to multiplication cache
*
* Returns: MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK.
* On success the contract guarantees
* array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q).
**************************************************/
static MLK_INLINE int mlk_poly_mulcache_compute_native(
int16_t cache[MLKEM_N / 2], const int16_t mlk_poly[MLKEM_N])
__contract__(
requires(memory_no_alias(cache, sizeof(int16_t) * (MLKEM_N / 2)))
requires(memory_no_alias(mlk_poly, sizeof(int16_t) * MLKEM_N))
assigns(object_whole(cache))
ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_abs_bound(cache, 0, MLKEM_N/2, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2
/*************************************************
* Name: mlk_polyvec_basemul_acc_montgomery_cached_k2_native
*
* Description: Compute scalar product of length-2 polynomial vectors in NTT
* domain.
*
* Arguments: INPUT:
* - a: First polynomial vector operand.
* This must be in NTT domain and in bitreversed order, or of
* a custom order if MLK_USE_NATIVE_NTT_CUSTOM_ORDER is set.
* See the documentation of MLK_USE_NATIVE_NTT_CUSTOM_ORDER
* for more information.
* The contract additionally requires
* array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT).
* - b: Second polynomial vector operand.
* As for a (the contract states no coefficient bound for b).
* - b_cache: Multiplication-cache for b.
* OUTPUT
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
*
* Returns: MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK.
**************************************************/
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(memory_no_alias(a, sizeof(int16_t) * 2 * MLKEM_N))
requires(memory_no_alias(b, sizeof(int16_t) * 2 * MLKEM_N))
requires(memory_no_alias(b_cache, sizeof(int16_t) * 2 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 2 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3
/*************************************************
* Name: mlk_polyvec_basemul_acc_montgomery_cached_k3_native
*
* Description: Compute scalar product of length-3 polynomial vectors in NTT
* domain.
*
* Arguments: INPUT:
* - a: First polynomial vector operand.
* This must be in NTT domain and in bitreversed order, or of
* a custom order if MLK_USE_NATIVE_NTT_CUSTOM_ORDER is set.
* See the documentation of MLK_USE_NATIVE_NTT_CUSTOM_ORDER
* for more information.
* The contract additionally requires
* array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT).
* - b: Second polynomial vector operand.
* As for a (the contract states no coefficient bound for b).
* - b_cache: Multiplication-cache for b.
* OUTPUT
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
*
* Returns: MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK.
**************************************************/
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(memory_no_alias(a, sizeof(int16_t) * 3 * MLKEM_N))
requires(memory_no_alias(b, sizeof(int16_t) * 3 * MLKEM_N))
requires(memory_no_alias(b_cache, sizeof(int16_t) * 3 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 3 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
/*************************************************
* Name: mlk_polyvec_basemul_acc_montgomery_cached_k4_native
*
* Description: Compute scalar product of length-4 polynomial vectors in NTT
* domain.
*
* Arguments: INPUT:
* - a: First polynomial vector operand.
* This must be in NTT domain and in bitreversed order, or of
* a custom order if MLK_USE_NATIVE_NTT_CUSTOM_ORDER is set.
* See the documentation of MLK_USE_NATIVE_NTT_CUSTOM_ORDER
* for more information.
* The contract additionally requires
* array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT).
* - b: Second polynomial vector operand.
* As for a (the contract states no coefficient bound for b).
* - b_cache: Multiplication-cache for b.
* OUTPUT
* - r: The result of the scalar product. This is again
* in NTT domain, and of the same ordering as a and b.
*
* Returns: MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK.
**************************************************/
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(memory_no_alias(a, sizeof(int16_t) * 4 * MLKEM_N))
requires(memory_no_alias(b, sizeof(int16_t) * 4 * MLKEM_N))
requires(memory_no_alias(b_cache, sizeof(int16_t) * 4 * (MLKEM_N / 2)))
requires(array_bound(a, 0, 4 * MLKEM_N, 0, MLKEM_UINT12_LIMIT))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || return_value == MLK_NATIVE_FUNC_SUCCESS)
);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
#if defined(MLK_USE_NATIVE_POLY_TOBYTES)
/*************************************************
* Name: mlk_poly_tobytes_native
*
* Description: Serialization of a polynomial.
*
* Arguments: INPUT:
* - a: const pointer to input polynomial.
* The contract requires
* array_bound(a, 0, MLKEM_N, 0, MLKEM_Q), i.e. the
* coefficients must already be non-negative
* (presumably 0 .. Q-1).
* NOTE(review): this comment previously stated the range
* -Q+1 .. Q-1 and a signed-to-unsigned conversion; the
* contract does not permit negative inputs.
* OUTPUT
* - r: pointer to output byte array
* (of MLKEM_POLYBYTES bytes)
*
* Returns: MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK.
**************************************************/
static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
const int16_t a[MLKEM_N])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
requires(array_bound(a, 0, MLKEM_N, 0, MLKEM_Q))
assigns(object_whole(r))
ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
);
#endif /* MLK_USE_NATIVE_POLY_TOBYTES */
#if defined(MLK_USE_NATIVE_POLY_FROMBYTES)
/*************************************************
* Name: mlk_poly_frombytes_native
*
* Description: De-serialization of a polynomial from a byte array.
*
* Arguments: INPUT:
* - r: const pointer to input byte array
* (of MLKEM_POLYBYTES bytes)
* OUTPUT
* - a: pointer to output polynomial in NTT domain
*
* Returns: MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK.
* On success the contract guarantees
* array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT).
**************************************************/
static MLK_INLINE int mlk_poly_frombytes_native(
int16_t a[MLKEM_N], const uint8_t r[MLKEM_POLYBYTES])
__contract__(
requires(memory_no_alias(r, MLKEM_POLYBYTES))
requires(memory_no_alias(a, sizeof(int16_t) * MLKEM_N))
assigns(memory_slice(a, sizeof(int16_t) * MLKEM_N))
ensures(return_value == MLK_NATIVE_FUNC_SUCCESS || return_value == MLK_NATIVE_FUNC_FALLBACK)
ensures((return_value == MLK_NATIVE_FUNC_SUCCESS) ==> array_bound(a, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT))
);
#endif /* MLK_USE_NATIVE_POLY_FROMBYTES */
#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
/*************************************************
* Name: mlk_rej_uniform_native
*
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned len: requested number of 16-bit integers
* (uniform mod q).
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* - unsigned buflen: length of input buffer in bytes.
* The contract requires buflen % 3 == 0, and both
* len and buflen to be at most 4096.
*
* Returns MLK_NATIVE_FUNC_FALLBACK if the native implementation does not
* support the input lengths. Otherwise, returns the non-negative number of
* sampled 16-bit integers (at most len); the first return_value entries of r
* then satisfy array_bound(r, 0, return_value, 0, MLKEM_Q).
**************************************************/
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
                                             const uint8_t *buf,
                                             unsigned buflen)
__contract__(
  requires(len <= 4096 && buflen <= 4096 && buflen % 3 == 0)
  requires(memory_no_alias(r, sizeof(int16_t) * len))
  requires(memory_no_alias(buf, buflen))
  assigns(memory_slice(r, sizeof(int16_t) * len))
  ensures(return_value == MLK_NATIVE_FUNC_FALLBACK || (0 <= return_value && return_value <= len))
  /* Use the same sentinel as the clause above rather than a literal -1, so
   * the two postconditions cannot drift apart. */
  ensures(return_value != MLK_NATIVE_FUNC_FALLBACK ==> array_bound(r, 0, (unsigned) return_value, 0, MLKEM_Q))
);
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D4)
/*************************************************
* Name: mlk_poly_compress_d4_native
*
* Description: Compression (4 bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
* - const int16_t a[MLKEM_N]: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Returns: int status code. No contract is attached to this declaration;
* presumably MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK as for
* the other native hooks (TODO confirm).
**************************************************/
static MLK_INLINE int mlk_poly_compress_d4_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N]);
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D10)
/*************************************************
* Name: mlk_poly_compress_d10_native
*
* Description: Compression (10 bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
* - const int16_t a[MLKEM_N]: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Returns: int status code. No contract is attached to this declaration;
* presumably MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK as for
* the other native hooks (TODO confirm).
**************************************************/
static MLK_INLINE int mlk_poly_compress_d10_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N]);
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D10 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D4)
/*************************************************
* Name: mlk_poly_decompress_d4_native
*
* Description: De-serialization and subsequent decompression (4 bits) of a
* polynomial; approximate inverse of mlk_poly_compress_d4
*
* Arguments: - int16_t r[MLKEM_N]: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D4 bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Returns: int status code. No contract is attached to this declaration;
* presumably MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK as for
* the other native hooks (TODO confirm).
*
**************************************************/
static MLK_INLINE int mlk_poly_decompress_d4_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D4 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D10)
/*************************************************
* Name: mlk_poly_decompress_d10_native
*
* Description: De-serialization and subsequent decompression (10 bits) of a
* polynomial; approximate inverse of mlk_poly_compress_d10
*
* Arguments: - int16_t r[MLKEM_N]: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D10 bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Returns: int status code. No contract is attached to this declaration;
* presumably MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK as for
* the other native hooks (TODO confirm).
*
**************************************************/
static MLK_INLINE int mlk_poly_decompress_d10_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D10 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D5)
/*************************************************
* Name: mlk_poly_compress_d5_native
*
* Description: Compression (5 bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
* - const int16_t a[MLKEM_N]: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Returns: int status code. No contract is attached to this declaration;
* presumably MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK as for
* the other native hooks (TODO confirm).
**************************************************/
static MLK_INLINE int mlk_poly_compress_d5_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N]);
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_COMPRESS_D11)
/*************************************************
* Name: mlk_poly_compress_d11_native
*
* Description: Compression (11 bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
* - const int16_t a[MLKEM_N]: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Returns: int status code. No contract is attached to this declaration;
* presumably MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK as for
* the other native hooks (TODO confirm).
**************************************************/
static MLK_INLINE int mlk_poly_compress_d11_native(
uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N]);
#endif /* MLK_USE_NATIVE_POLY_COMPRESS_D11 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D5)
/*************************************************
* Name: mlk_poly_decompress_d5_native
*
* Description: De-serialization and subsequent decompression (5 bits) of a
* polynomial; approximate inverse of mlk_poly_compress_d5
*
* Arguments: - int16_t r[MLKEM_N]: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D5 bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Returns: int status code. No contract is attached to this declaration;
* presumably MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK as for
* the other native hooks (TODO confirm).
*
**************************************************/
static MLK_INLINE int mlk_poly_decompress_d5_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D5 */
#if defined(MLK_USE_NATIVE_POLY_DECOMPRESS_D11)
/*************************************************
* Name: mlk_poly_decompress_d11_native
*
* Description: De-serialization and subsequent decompression (11 bits) of a
* polynomial; approximate inverse of mlk_poly_compress_d11
*
* Arguments: - int16_t r[MLKEM_N]: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_D11 bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Returns: int status code. No contract is attached to this declaration;
* presumably MLK_NATIVE_FUNC_SUCCESS or MLK_NATIVE_FUNC_FALLBACK as for
* the other native hooks (TODO confirm).
*
**************************************************/
static MLK_INLINE int mlk_poly_decompress_d11_native(
int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
#endif /* MLK_USE_NATIVE_POLY_DECOMPRESS_D11 */
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* !MLK_NATIVE_API_H */

View File

@@ -0,0 +1,277 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_NATIVE_X86_64_META_H
#define MLK_NATIVE_X86_64_META_H
/* Identifier for this backend so that source and assembly files
* in the build can be appropriately guarded. */
#define MLK_ARITH_BACKEND_X86_64_DEFAULT
#define MLK_USE_NATIVE_NTT_CUSTOM_ORDER
#define MLK_USE_NATIVE_REJ_UNIFORM
#define MLK_USE_NATIVE_NTT
#define MLK_USE_NATIVE_INTT
#define MLK_USE_NATIVE_POLY_REDUCE
#define MLK_USE_NATIVE_POLY_TOMONT
#define MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED
#define MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE
#define MLK_USE_NATIVE_POLY_TOBYTES
#define MLK_USE_NATIVE_POLY_FROMBYTES
#if !defined(__ASSEMBLER__)
#include <string.h>
#include "../../common.h"
#include "../api.h"
#include "src/arith_native_x86_64.h"
/* Re-order data in place via mlk_nttunpack_avx2 when AVX2 is reported
 * available; without AVX2 the data is left untouched. */
static MLK_INLINE void mlk_poly_permute_bitrev_to_custom(int16_t data[MLKEM_N])
{
  if (!mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    return;
  }
  mlk_nttunpack_avx2(data);
}
/* Rejection-sampling hook for the x86_64 backend.
 *
 * The assembly routine is only used when AVX2 is available, exactly MLKEM_N
 * outputs are requested, and buflen is a multiple of 12; in every other case
 * MLK_NATIVE_FUNC_FALLBACK is returned. Otherwise the value returned by
 * mlk_rej_uniform_asm is passed on, cast to int. */
static MLK_INLINE int mlk_rej_uniform_native(int16_t *r, unsigned len,
                                             const uint8_t *buf,
                                             unsigned buflen)
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2) && len == MLKEM_N &&
      buflen % 12 == 0)
  {
    return (int)mlk_rej_uniform_asm(r, buf, buflen, mlk_rej_uniform_table);
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* NTT hook: runs mlk_ntt_avx2 on data (with mlk_qdata) when AVX2 is
 * available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_ntt_native(int16_t data[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_ntt_avx2(data, mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* Inverse-NTT hook: runs mlk_invntt_avx2 on data (with mlk_qdata) when AVX2
 * is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_intt_native(int16_t data[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_invntt_avx2(data, mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* Coefficient-reduction hook: runs mlk_reduce_avx2 on data (with mlk_qdata)
 * when AVX2 is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_reduce_native(int16_t data[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_reduce_avx2(data, mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* Montgomery-conversion hook: runs mlk_tomont_avx2 on data (with mlk_qdata)
 * when AVX2 is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_tomont_native(int16_t data[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_tomont_avx2(data, mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* Multiplication-cache hook: fills x from y via
 * mlk_poly_mulcache_compute_avx2 (with mlk_qdata) when AVX2 is available,
 * otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_mulcache_compute_native(int16_t x[MLKEM_N / 2],
                                                       const int16_t y[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_mulcache_compute_avx2(x, y, mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 2
/* Length-2 cached base-multiplication hook: forwards to the k2 assembly
 * routine (with mlk_qdata) when AVX2 is available, otherwise requests the
 * fallback path. */
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
    int16_t r[MLKEM_N], const int16_t a[2 * MLKEM_N],
    const int16_t b[2 * MLKEM_N], const int16_t b_cache[2 * (MLKEM_N / 2)])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(r, a, b, b_cache,
                                                     mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 3
/* Length-3 cached base-multiplication hook: forwards to the k3 assembly
 * routine (with mlk_qdata) when AVX2 is available, otherwise requests the
 * fallback path. */
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
    int16_t r[MLKEM_N], const int16_t a[3 * MLKEM_N],
    const int16_t b[3 * MLKEM_N], const int16_t b_cache[3 * (MLKEM_N / 2)])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(r, a, b, b_cache,
                                                     mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
/* Length-4 cached base-multiplication hook: forwards to the k4 assembly
 * routine (with mlk_qdata) when AVX2 is available, otherwise requests the
 * fallback path. */
static MLK_INLINE int mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
    int16_t r[MLKEM_N], const int16_t a[4 * MLKEM_N],
    const int16_t b[4 * MLKEM_N], const int16_t b_cache[4 * (MLKEM_N / 2)])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(r, a, b, b_cache,
                                                     mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
/* Serialization hook: writes r from a via mlk_ntttobytes_avx2 (with
 * mlk_qdata) when AVX2 is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_tobytes_native(uint8_t r[MLKEM_POLYBYTES],
                                              const int16_t a[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_ntttobytes_avx2(r, a, mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* De-serialization hook: fills r from the byte array a via
 * mlk_nttfrombytes_avx2 (with mlk_qdata) when AVX2 is available, otherwise
 * requests the fallback path. */
static MLK_INLINE int mlk_poly_frombytes_native(
    int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYBYTES])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_nttfrombytes_avx2(r, a, mlk_qdata);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
/* 4-bit compression hook: forwards to mlk_poly_compress_d4_avx2 when AVX2 is
 * available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_compress_d4_native(
    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4], const int16_t a[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_compress_d4_avx2(r, a);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* 10-bit compression hook: forwards to mlk_poly_compress_d10_avx2 when AVX2
 * is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_compress_d10_native(
    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10], const int16_t a[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_compress_d10_avx2(r, a);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* 4-bit decompression hook: forwards to mlk_poly_decompress_d4_avx2 when
 * AVX2 is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_decompress_d4_native(
    int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_decompress_d4_avx2(r, a);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* 10-bit decompression hook: forwards to mlk_poly_decompress_d10_avx2 when
 * AVX2 is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_decompress_d10_native(
    int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_decompress_d10_avx2(r, a);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
/* 5-bit compression hook: forwards to mlk_poly_compress_d5_avx2 when AVX2 is
 * available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_compress_d5_native(
    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5], const int16_t a[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_compress_d5_avx2(r, a);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* 11-bit compression hook: forwards to mlk_poly_compress_d11_avx2 when AVX2
 * is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_compress_d11_native(
    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11], const int16_t a[MLKEM_N])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_compress_d11_avx2(r, a);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* 5-bit decompression hook: forwards to mlk_poly_decompress_d5_avx2 when
 * AVX2 is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_decompress_d5_native(
    int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_decompress_d5_avx2(r, a);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
/* 11-bit decompression hook: forwards to mlk_poly_decompress_d11_avx2 when
 * AVX2 is available, otherwise requests the fallback path. */
static MLK_INLINE int mlk_poly_decompress_d11_native(
    int16_t r[MLKEM_N], const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
{
  if (mlk_sys_check_capability(MLK_SYS_CAP_AVX2))
  {
    mlk_poly_decompress_d11_avx2(r, a);
    return MLK_NATIVE_FUNC_SUCCESS;
  }
  return MLK_NATIVE_FUNC_FALLBACK;
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#endif /* !__ASSEMBLER__ */
#endif /* !MLK_NATIVE_X86_64_META_H */

View File

@@ -0,0 +1,100 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H
#define MLK_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H
#include "../../../common.h"
#include <immintrin.h>
#include <stdint.h>
#include "consts.h"
/* Buffer length in bytes: 3 blocks of 168 bytes each (see trailing comment
 * for the named factors). */
#define MLK_AVX2_REJ_UNIFORM_BUFLEN \
(3 * 168) /* REJ_UNIFORM_NBLOCKS * SHAKE128_RATE */
/* Rejection-sampling routine. Takes the output buffer r, the input bytes
 * buf/buflen and a lookup table; the uint64_t result is what the native
 * wrapper returns (cast to int) as the number of sampled values. */
#define mlk_rej_uniform_asm MLK_NAMESPACE(rej_uniform_asm)
uint64_t mlk_rej_uniform_asm(int16_t *r, const uint8_t *buf, unsigned buflen,
const uint8_t *table);
/* Lookup table passed as the `table` argument of mlk_rej_uniform_asm;
 * defined elsewhere. */
#define mlk_rej_uniform_table MLK_NAMESPACE(rej_uniform_table)
extern const uint8_t mlk_rej_uniform_table[];
/* In-place forward NTT on r; mlk_qdata is the backend constant table. */
#define mlk_ntt_avx2 MLK_NAMESPACE(ntt_avx2)
void mlk_ntt_avx2(int16_t *r, const int16_t *mlk_qdata);
/* In-place inverse NTT on r; mlk_qdata is the backend constant table. */
#define mlk_invntt_avx2 MLK_NAMESPACE(invntt_avx2)
void mlk_invntt_avx2(int16_t *r, const int16_t *mlk_qdata);
/* In-place re-ordering of r; used by the backend to convert from bit-reversed
 * to its custom coefficient order. */
#define mlk_nttunpack_avx2 MLK_NAMESPACE(nttunpack_avx2)
void mlk_nttunpack_avx2(int16_t *r);
/* In-place coefficient reduction of r. */
#define mlk_reduce_avx2 MLK_NAMESPACE(reduce_avx2)
void mlk_reduce_avx2(int16_t *r, const int16_t *mlk_qdata);
/* Computes the multiplication cache `out` for the polynomial `in`. */
#define mlk_poly_mulcache_compute_avx2 MLK_NAMESPACE(poly_mulcache_compute_avx2)
void mlk_poly_mulcache_compute_avx2(int16_t *out, const int16_t *in,
const int16_t *mlk_qdata);
/* Cached base multiplication with accumulation for vectors of length 2, 3
 * and 4 respectively: r receives the result for operands a and b, with
 * b_cache the multiplication cache for b. */
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k2 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k2)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k2(int16_t *r,
const int16_t *a,
const int16_t *b,
const int16_t *b_cache,
const int16_t *qdata);
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k3 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k3)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k3(int16_t *r,
const int16_t *a,
const int16_t *b,
const int16_t *b_cache,
const int16_t *qdata);
#define mlk_polyvec_basemul_acc_montgomery_cached_asm_k4 \
MLK_NAMESPACE(polyvec_basemul_acc_montgomery_cached_asm_k4)
void mlk_polyvec_basemul_acc_montgomery_cached_asm_k4(int16_t *r,
const int16_t *a,
const int16_t *b,
const int16_t *b_cache,
const int16_t *qdata);
/* Serializes polynomial a into byte array r. */
#define mlk_ntttobytes_avx2 MLK_NAMESPACE(ntttobytes_avx2)
void mlk_ntttobytes_avx2(uint8_t *r, const int16_t *a,
const int16_t *mlk_qdata);
/* De-serializes byte array a into polynomial r. */
#define mlk_nttfrombytes_avx2 MLK_NAMESPACE(nttfrombytes_avx2)
void mlk_nttfrombytes_avx2(int16_t *r, const uint8_t *a,
const int16_t *mlk_qdata);
/* In-place conversion of r to Montgomery domain. */
#define mlk_tomont_avx2 MLK_NAMESPACE(tomont_avx2)
void mlk_tomont_avx2(int16_t *r, const int16_t *mlk_qdata);
/* Compression / decompression routines for 4, 10, 5 and 11 bits per
 * coefficient. Compress: r is the packed output, a the input polynomial.
 * Decompress: r is the output polynomial, a the packed input.
 * NOTE(review): the d10/d11 namespaced symbols omit the "_d" infix
 * (poly_compress10_avx2) unlike d4/d5 (poly_compress_d4_avx2). */
#define mlk_poly_compress_d4_avx2 MLK_NAMESPACE(poly_compress_d4_avx2)
void mlk_poly_compress_d4_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const int16_t *MLK_RESTRICT a);
#define mlk_poly_decompress_d4_avx2 MLK_NAMESPACE(poly_decompress_d4_avx2)
void mlk_poly_decompress_d4_avx2(int16_t *MLK_RESTRICT r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4]);
#define mlk_poly_compress_d10_avx2 MLK_NAMESPACE(poly_compress10_avx2)
void mlk_poly_compress_d10_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const int16_t *MLK_RESTRICT a);
#define mlk_poly_decompress_d10_avx2 MLK_NAMESPACE(poly_decompress10_avx2)
void mlk_poly_decompress_d10_avx2(
int16_t *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10]);
#define mlk_poly_compress_d5_avx2 MLK_NAMESPACE(poly_compress_d5_avx2)
void mlk_poly_compress_d5_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
const int16_t *MLK_RESTRICT a);
#define mlk_poly_decompress_d5_avx2 MLK_NAMESPACE(poly_decompress_d5_avx2)
void mlk_poly_decompress_d5_avx2(int16_t *MLK_RESTRICT r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5]);
#define mlk_poly_compress_d11_avx2 MLK_NAMESPACE(poly_compress11_avx2)
void mlk_poly_compress_d11_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
const int16_t *MLK_RESTRICT a);
#define mlk_poly_decompress_d11_avx2 MLK_NAMESPACE(poly_decompress11_avx2)
void mlk_poly_decompress_d11_avx2(
int16_t *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11]);
#endif /* !MLK_NATIVE_X86_64_SRC_ARITH_NATIVE_X86_64_H */

View File

@@ -0,0 +1,387 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <immintrin.h>
#include <stdint.h>
#include <string.h>
#include "arith_native_x86_64.h"
#include "consts.h"
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || (MLKEM_K == 2 || MLKEM_K == 3)
/*
 * Compress polynomial a to 4 bits per coefficient and write the packed
 * bytes to r.
 *
 * - r: output byte array (MLKEM_POLYCOMPRESSEDBYTES_D4 bytes); written with
 *      unaligned stores, 32 bytes per loop iteration.
 * - a: input coefficients; read with _mm256_load_si256, which requires
 *      32-byte alignment of a.
 *
 * Each iteration consumes 64 coefficients (four vectors of 16).
 */
void mlk_poly_compress_d4_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D4],
const int16_t *MLK_RESTRICT a)
{
unsigned int i;
__m256i f0, f1, f2, f3;
/* Multiplier vector taken from the backend constant table; its value is not
 * visible here (presumably an approximation of 2^k / q -- TODO confirm). */
const __m256i v = _mm256_load_si256(
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XV]);
/* mulhrs by 2^9 computes (x * 2^9 + 2^14) >> 15, i.e. a rounding right shift
 * by 6. */
const __m256i shift1 = _mm256_set1_epi16(1 << 9);
/* Keep the low 4 bits of every 16-bit lane. */
const __m256i mask = _mm256_set1_epi16(15);
/* Byte pair (1, 16) for maddubs: even_byte * 1 + odd_byte * 16, which merges
 * two 4-bit values into one 8-bit value. */
const __m256i shift2 = _mm256_set1_epi16((16 << 8) + 1);
/* Dword permutation undoing the per-128-bit-lane interleaving left by the
 * two pack steps. */
const __m256i permdidx = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
for (i = 0; i < MLKEM_N / 64; i++)
{
f0 = _mm256_load_si256((__m256i *)&a[64 * i + 16 * 0]);
f1 = _mm256_load_si256((__m256i *)&a[64 * i + 16 * 1]);
f2 = _mm256_load_si256((__m256i *)&a[64 * i + 16 * 2]);
f3 = _mm256_load_si256((__m256i *)&a[64 * i + 16 * 3]);
/* High half of x * v ... */
f0 = _mm256_mulhi_epi16(f0, v);
f1 = _mm256_mulhi_epi16(f1, v);
f2 = _mm256_mulhi_epi16(f2, v);
f3 = _mm256_mulhi_epi16(f3, v);
/* ... followed by the rounding shift ... */
f0 = _mm256_mulhrs_epi16(f0, shift1);
f1 = _mm256_mulhrs_epi16(f1, shift1);
f2 = _mm256_mulhrs_epi16(f2, shift1);
f3 = _mm256_mulhrs_epi16(f3, shift1);
/* ... and reduction to 4 bits. */
f0 = _mm256_and_si256(f0, mask);
f1 = _mm256_and_si256(f1, mask);
f2 = _mm256_and_si256(f2, mask);
f3 = _mm256_and_si256(f3, mask);
/* Narrow 16-bit lanes to bytes (one nibble per byte). */
f0 = _mm256_packus_epi16(f0, f1);
f2 = _mm256_packus_epi16(f2, f3);
/* Merge adjacent nibbles into one value per 16-bit lane. */
f0 = _mm256_maddubs_epi16(f0, shift2);
f2 = _mm256_maddubs_epi16(f2, shift2);
/* Narrow again to bytes, then restore the natural byte order. */
f0 = _mm256_packus_epi16(f0, f2);
f0 = _mm256_permutevar8x32_epi32(f0, permdidx);
_mm256_storeu_si256((__m256i *)&r[32 * i], f0);
}
}
/*
 * Decompress a 4-bit-per-coefficient byte array a into polynomial r.
 *
 * - r: output coefficients; written with unaligned stores, 16 coefficients
 *      per loop iteration.
 * - a: packed input (MLKEM_POLYCOMPRESSEDBYTES_D4 bytes); 8 bytes are read
 *      per loop iteration.
 *
 * Each nibble n is first scaled to n * 2^11 and then multiplied by q with
 * mulhrs, i.e. (n * 2^11 * q + 2^14) >> 15 = round(n * q / 16).
 */
void mlk_poly_decompress_d4_avx2(int16_t *MLK_RESTRICT r,
const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D4])
{
unsigned int i;
__m128i t;
__m256i f;
/* Vector q from the backend constant table (presumably 16 copies of MLKEM_Q
 * -- TODO confirm against the table definition). */
const __m256i q = _mm256_load_si256(
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ]);
/* Replicate each of the 8 input bytes four times, so every 32-bit lane holds
 * the same byte in both of its 16-bit halves. */
const __m256i shufbidx =
_mm256_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3,
3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
/* Low 16-bit half keeps the low nibble (0x000F), high half keeps the high
 * nibble in place (0x00F0). */
const __m256i mask = _mm256_set1_epi32(0x00F0000F);
/* Low half is multiplied by 2048, high half by 128; both yield
 * nibble * 2^11. */
const __m256i shift = _mm256_set1_epi32((128 << 16) + 2048);
for (i = 0; i < MLKEM_N / 16; i++)
{
t = _mm_loadl_epi64((__m128i *)&a[8 * i]);
f = _mm256_broadcastsi128_si256(t);
f = _mm256_shuffle_epi8(f, shufbidx);
f = _mm256_and_si256(f, mask);
f = _mm256_mullo_epi16(f, shift);
f = _mm256_mulhrs_epi16(f, q);
_mm256_storeu_si256((__m256i *)&r[16 * i], f);
}
}
/*
 * Compress polynomial a to 10 bits per coefficient and write the packed
 * bytes to r.
 *
 * - r: output byte array (MLKEM_POLYCOMPRESSEDBYTES_D10 bytes); 20 bytes are
 *      written per loop iteration (16 via an unaligned store, 4 via
 *      mlk_memcpy).
 * - a: input coefficients; read with _mm256_load_si256, which requires
 *      32-byte alignment of a.
 *
 * Each iteration consumes 16 coefficients.
 */
void mlk_poly_compress_d10_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D10],
const int16_t *MLK_RESTRICT a)
{
unsigned int i;
__m256i f0, f1, f2;
__m128i t0, t1;
/* Multiplier vector from the backend constant table (value not visible
 * here), and the same multiplier shifted left by 3. */
const __m256i v = _mm256_load_si256(
(__m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XV]);
const __m256i v8 = _mm256_slli_epi16(v, 3);
const __m256i off = _mm256_set1_epi16(15);
/* mulhrs by 2^12 is a rounding right shift by 3. */
const __m256i shift1 = _mm256_set1_epi16(1 << 12);
/* Keep the low 10 bits of every 16-bit lane. */
const __m256i mask = _mm256_set1_epi16(1023);
/* Word pair (1, 1024) for madd: even + 1024 * odd, merging two 10-bit values
 * into a 20-bit value per 32-bit lane. */
const __m256i shift2 =
_mm256_set1_epi64x((1024LL << 48) + (1LL << 32) + (1024 << 16) + 1);
/* Seen as 32-bit lanes this is (12, 0, 12, 0, ...): only the even dwords are
 * shifted left by 12. */
const __m256i sllvdidx = _mm256_set1_epi64x(12);
/* Gathers the 5 significant bytes of each 64-bit lane; the upper 128-bit
 * lane is arranged so its first 6 bytes land in bytes 10..15 and its last 4
 * bytes in bytes 0..3. */
const __m256i shufbidx =
_mm256_set_epi8(8, 4, 3, 2, 1, 0, -1, -1, -1, -1, -1, -1, 12, 11, 10, 9,
-1, -1, -1, -1, -1, -1, 12, 11, 10, 9, 8, 4, 3, 2, 1, 0);
for (i = 0; i < MLKEM_N / 16; i++)
{
f0 = _mm256_load_si256((__m256i *)&a[16 * i]);
/* f1/f2 build a 0-or-1 correction term that is subtracted from the high
 * product below. NOTE(review): the arithmetic justification for this
 * correction is not given in this file. */
f1 = _mm256_mullo_epi16(f0, v8);
f2 = _mm256_add_epi16(f0, off);
f0 = _mm256_slli_epi16(f0, 3);
f0 = _mm256_mulhi_epi16(f0, v);
f2 = _mm256_sub_epi16(f1, f2);
f1 = _mm256_andnot_si256(f1, f2);
f1 = _mm256_srli_epi16(f1, 15);
f0 = _mm256_sub_epi16(f0, f1);
f0 = _mm256_mulhrs_epi16(f0, shift1);
f0 = _mm256_and_si256(f0, mask);
/* Pack: 2 x 10 bits -> 20 bits per dword, then 2 x 20 bits -> 40 contiguous
 * bits at the bottom of each qword. */
f0 = _mm256_madd_epi16(f0, shift2);
f0 = _mm256_sllv_epi32(f0, sllvdidx);
f0 = _mm256_srli_epi64(f0, 12);
f0 = _mm256_shuffle_epi8(f0, shufbidx);
t0 = _mm256_castsi256_si128(f0);
t1 = _mm256_extracti128_si256(f0, 1);
/* Words 5..7 (bytes 10..15) come from the upper lane, completing the first
 * 16 output bytes. */
t0 = _mm_blend_epi16(t0, t1, 0xE0);
_mm_storeu_si128((__m128i *)&r[20 * i + 0], t0);
/* Remaining 4 bytes of this 20-byte group. */
mlk_memcpy(&r[20 * i + 16], &t1, 4);
}
}
/*
 * Decompress a 10-bit-per-coefficient byte array a into polynomial r.
 *
 * - r: output coefficients; written with unaligned stores, 16 coefficients
 *      per iteration.
 * - a: packed input (MLKEM_POLYCOMPRESSEDBYTES_D10 bytes); 20 bytes are
 *      consumed per iteration.
 *
 * All but the last group are loaded with a full 32-byte unaligned load (which
 * reads 12 bytes beyond the 20 that are used); the last group is copied with
 * mlk_memcpy so that no byte past the end of a is read.
 */
void mlk_poly_decompress_d10_avx2(
int16_t *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D10])
{
unsigned int i;
__m256i f;
/* Per 32-bit lane: low word 4 * Q, high word Q. Together with the field
 * positions selected by `mask` below, mulhrs yields round(x * Q / 1024) for
 * both words. */
const __m256i q = _mm256_set1_epi32((MLKEM_Q << 16) + 4 * MLKEM_Q);
/* Places, for every output coefficient, the two input bytes containing its
 * 10 bits into one 16-bit lane. */
const __m256i shufbidx =
_mm256_set_epi8(11, 10, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 3, 2, 9, 8,
8, 7, 7, 6, 6, 5, 4, 3, 3, 2, 2, 1, 1, 0);
/* Seen as 32-bit lanes this is (4, 0, 4, 0, ...): only the even dwords are
 * shifted left by 4. */
const __m256i sllvdidx = _mm256_set1_epi64x(4);
/* After the variable shift and the 1-bit right shift, the 10-bit field sits
 * at bits 3..12 of each low word and bits 5..14 of each high word:
 * 8184 = 1023 << 3 and 32736 = 1023 << 5. */
/* check-magic: off */
const __m256i mask = _mm256_set1_epi32((32736 << 16) + 8184);
/* check-magic: on */
for (i = 0; i < (MLKEM_N / 16) - 1; i++)
{
f = _mm256_loadu_si256((__m256i *)&a[20 * i]);
/* Qword selection (0, 1, 1, 2): the upper 128-bit lane starts at input
 * byte 8. */
f = _mm256_permute4x64_epi64(f, 0x94);
f = _mm256_shuffle_epi8(f, shufbidx);
f = _mm256_sllv_epi32(f, sllvdidx);
f = _mm256_srli_epi16(f, 1);
f = _mm256_and_si256(f, mask);
f = _mm256_mulhrs_epi16(f, q);
_mm256_storeu_si256((__m256i *)&r[16 * i], f);
}
/* Handle load in last iteration especially to avoid buffer overflow.
 * Only bytes 0..19 of f are defined afterwards; the permute and shuffle
 * below select no byte beyond index 19. */
mlk_memcpy(&f, &a[20 * i], 20);
/* The rest is the same */
f = _mm256_permute4x64_epi64(f, 0x94);
f = _mm256_shuffle_epi8(f, shufbidx);
f = _mm256_sllv_epi32(f, sllvdidx);
f = _mm256_srli_epi16(f, 1);
f = _mm256_and_si256(f, mask);
f = _mm256_mulhrs_epi16(f, q);
_mm256_storeu_si256((__m256i *)&r[16 * i], f);
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 2 || MLKEM_K == 3 */
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_K == 4
/*
 * Compress a polynomial to 5 bits per coefficient and serialize it.
 *
 * Processes MLKEM_N coefficients in groups of 32; every group yields
 * 32 * 5 bits = 20 output bytes.
 *
 * r: output buffer of MLKEM_POLYCOMPRESSEDBYTES_D5 bytes
 *    (written with unaligned stores).
 * a: input coefficients; read with aligned 256-bit loads, so the
 *    array must be 32-byte aligned.
 */
void mlk_poly_compress_d5_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D5],
                               const int16_t *MLK_RESTRICT a)
{
  const __m256i v = _mm256_load_si256(
      (const __m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XV]);
  /* mulhrs by 2^10 is a rounding division by 2^5. */
  const __m256i shift1 = _mm256_set1_epi16(1 << 10);
  const __m256i mask = _mm256_set1_epi16(31);
  /* Byte multipliers (1, 32): two 5-bit values -> one 10-bit value. */
  const __m256i shift2 = _mm256_set1_epi16((32 << 8) + 1);
  /* Word multipliers (1, 1024): two 10-bit values -> one 20-bit value. */
  const __m256i shift3 = _mm256_set1_epi32((1024 << 16) + 1);
  /* As 32-bit lanes: (12, 0, 12, 0, ...); as 64-bit lanes: 12 everywhere. */
  const __m256i sllvdidx = _mm256_set1_epi64x(12);
  /* Byte gather for the final packing. Its low 128 bits double as the
   * blend mask below (entries of -1 have the top bit set). */
  const __m256i shufbidx =
      _mm256_set_epi8(8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0, -1, 12, 11, 10, 9,
                      -1, 12, 11, 10, 9, 8, -1, -1, -1, -1, -1, 4, 3, 2, 1, 0);
  unsigned int blk;

  for (blk = 0; blk < MLKEM_N / 32; blk++)
  {
    const int16_t *src = &a[32 * blk];

    /* high16(x * v), rounding division by 32, keep 5 bits. */
    const __m256i c_a = _mm256_and_si256(
        _mm256_mulhrs_epi16(
            _mm256_mulhi_epi16(
                _mm256_load_si256((const __m256i *)&src[16 * 0]), v),
            shift1),
        mask);
    const __m256i c_b = _mm256_and_si256(
        _mm256_mulhrs_epi16(
            _mm256_mulhi_epi16(
                _mm256_load_si256((const __m256i *)&src[16 * 1]), v),
            shift1),
        mask);

    /* One byte per coefficient; packus works per 128-bit lane, so the
     * two input vectors end up interleaved lane by lane. */
    const __m256i bytes = _mm256_packus_epi16(c_a, c_b);
    /* a0 a1 a2 a3 b0 b1 b2 b3 a4 a5 a6 a7 b4 b5 b6 b7 */
    const __m256i w10 = _mm256_maddubs_epi16(bytes, shift2);
    /* a0 a1 b0 b1 a2 a3 b2 b3 */
    const __m256i d20 = _mm256_madd_epi16(w10, shift3);
    /* 40 contiguous bits at the bottom of every 64-bit lane. */
    const __m256i q40 =
        _mm256_srlv_epi64(_mm256_sllv_epi32(d20, sllvdidx), sllvdidx);

    const __m256i packed = _mm256_shuffle_epi8(q40, shufbidx);
    const __m128i upper = _mm256_extracti128_si256(packed, 1);
    const __m128i head =
        _mm_blendv_epi8(_mm256_castsi256_si128(packed), upper,
                        _mm256_castsi256_si128(shufbidx));

    /* 16 bytes from the blended vector + the last 4 bytes of the group. */
    _mm_storeu_si128((__m128i *)&r[20 * blk + 0], head);
    mlk_memcpy(&r[20 * blk + 16], &upper, 4);
  }
}
/*
 * Deserialize and decompress a polynomial stored with 5 bits per
 * coefficient.
 *
 * Every group of 10 input bytes expands to 16 coefficients, each computed
 * as a rounded y * MLKEM_Q / 2^5 for the 5-bit field y.
 *
 * r: output coefficients (written with unaligned 256-bit stores).
 * a: input buffer of MLKEM_POLYCOMPRESSEDBYTES_D5 bytes.
 */
void mlk_poly_decompress_d5_avx2(int16_t *MLK_RESTRICT r,
                                 const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D5])
{
  const __m256i q = _mm256_load_si256(
      (const __m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ]);
  /* Places into each 16-bit lane the byte(s) that hold its 5-bit field. */
  const __m256i shufbidx =
      _mm256_set_epi8(9, 9, 9, 8, 8, 8, 8, 7, 7, 6, 6, 6, 6, 5, 5, 5, 4, 4, 4,
                      3, 3, 3, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0);
  /* Lane k uses mask 31 << s_k to isolate its field and multiplier
   * 2^(10 - s_k) to move it to bits 10..14 (e.g. 992 == 31 << 5 pairs
   * with 32 == 2^5). mulhrs by MLKEM_Q then computes a rounded
   * (y << 10) * MLKEM_Q / 2^15 == y * MLKEM_Q / 2^5. */
  /* check-magic: off */
  const __m256i mask = _mm256_set_epi16(248, 1984, 62, 496, 3968, 124, 992, 31,
                                        248, 1984, 62, 496, 3968, 124, 992, 31);
  const __m256i shift = _mm256_set_epi16(128, 16, 512, 64, 8, 256, 32, 1024,
                                         128, 16, 512, 64, 8, 256, 32, 1024);
  /* check-magic: on */
  unsigned int blk;

  for (blk = 0; blk < MLKEM_N / 16; blk++)
  {
    const uint8_t *src = &a[10 * blk];
    int16_t tail;
    __m128i chunk;
    __m256i f;

    /* Read exactly 10 bytes: 8 via a 64-bit load and 2 via memcpy, the
     * latter inserted as 16-bit word 4. */
    mlk_memcpy(&tail, &src[8], 2);
    chunk = _mm_insert_epi16(_mm_loadl_epi64((const __m128i *)&src[0]), tail,
                             4);

    /* Both 128-bit lanes see the same 10 bytes; the shuffle picks bytes
     * 0..4 for the lower lane and bytes 5..9 for the upper lane. */
    f = _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(chunk), shufbidx);
    f = _mm256_mullo_epi16(_mm256_and_si256(f, mask), shift);
    f = _mm256_mulhrs_epi16(f, q);
    _mm256_storeu_si256((__m256i *)&r[16 * blk], f);
  }
}
/*
 * Compress a polynomial to 11 bits per coefficient and serialize it.
 *
 * Processes MLKEM_N coefficients in groups of 16; every group yields
 * 16 * 11 bits = 22 output bytes.
 *
 * r: output buffer of MLKEM_POLYCOMPRESSEDBYTES_D11 bytes
 *    (written with unaligned stores).
 * a: input coefficients; read with aligned 256-bit loads, so the
 *    array must be 32-byte aligned.
 *
 * Per coefficient x the code evaluates an approximation of
 * round(x * 2^11 / MLKEM_Q) mod 2^11 using the multiplier v taken from
 * mlk_qdata (section 16XV).
 */
void mlk_poly_compress_d11_avx2(uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_D11],
                                const int16_t *MLK_RESTRICT a)
{
  const unsigned int last = (MLKEM_N / 16) - 1;
  const __m256i v = _mm256_load_si256(
      (const __m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XV]);
  const __m256i v8 = _mm256_slli_epi16(v, 3);
  const __m256i off = _mm256_set1_epi16(36);
  /* mulhrs by 2^13 is a rounding division by 2^2. */
  const __m256i shift1 = _mm256_set1_epi16(1 << 13);
  const __m256i mask = _mm256_set1_epi16(2047);
  /* 16-bit multipliers (1, 2048) repeated: madd fuses two 11-bit values
   * into one 22-bit value per 32-bit lane. */
  const __m256i shift2 =
      _mm256_set1_epi64x((2048LL << 48) + (1LL << 32) + (2048 << 16) + 1);
  /* Seen as 32-bit lanes this is (10, 0, 10, 0, ...). */
  const __m256i sllvdidx = _mm256_set1_epi64x(10);
  const __m256i srlvqidx = _mm256_set_epi64x(30, 10, 30, 10);
  /* Byte gather for the final packing. Its low 128 bits double as the
   * blend mask below (entries of -1 have the top bit set). */
  const __m256i shufbidx =
      _mm256_set_epi8(4, 3, 2, 1, 0, 0, -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, -1,
                      -1, -1, -1, -1, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
  unsigned int blk;

  for (blk = 0; blk <= last; blk++)
  {
    const __m256i coeffs = _mm256_load_si256((const __m256i *)&a[16 * blk]);

    /* Low and high halves of the product (8 * x) * v. */
    const __m256i prod_lo = _mm256_mullo_epi16(coeffs, v8);
    const __m256i prod_hi =
        _mm256_mulhi_epi16(_mm256_slli_epi16(coeffs, 3), v);

    /* 0/1 correction: 1 exactly when the sign bit of prod_lo is clear and
     * the sign bit of prod_lo - (x + 36) is set.
     * NOTE(review): presumably compensates the rounding error of v;
     * confirm against the reference implementation. */
    const __m256i biased = _mm256_add_epi16(coeffs, off);
    const __m256i delta = _mm256_sub_epi16(prod_lo, biased);
    const __m256i fix =
        _mm256_srli_epi16(_mm256_andnot_si256(prod_lo, delta), 15);
    const __m256i quot = _mm256_sub_epi16(prod_hi, fix);

    /* Rounding division by 4, then keep the low 11 bits. */
    const __m256i c11 =
        _mm256_and_si256(_mm256_mulhrs_epi16(quot, shift1), mask);

    /* 2 x 11 bits per dword, 44 contiguous bits per qword starting at
     * bit 10. */
    const __m256i pairs =
        _mm256_sllv_epi32(_mm256_madd_epi16(c11, shift2), sllvdidx);

    /* Join the two 44-bit halves of every 128-bit lane into 88
     * contiguous bits: the low bits of the upper qword are carried into
     * the top of the lower qword. */
    const __m256i carry =
        _mm256_slli_epi64(_mm256_bsrli_epi128(pairs, 8), 34);
    const __m256i lanes =
        _mm256_add_epi64(_mm256_srlv_epi64(pairs, srlvqidx), carry);

    const __m256i packed = _mm256_shuffle_epi8(lanes, shufbidx);
    const __m128i upper = _mm256_extracti128_si256(packed, 1);
    const __m128i head =
        _mm_blendv_epi8(_mm256_castsi256_si128(packed), upper,
                        _mm256_castsi256_si128(shufbidx));

    _mm_storeu_si128((__m128i *)&r[22 * blk + 0], head);
    if (blk < last)
    {
      /* 8-byte store: the 2 surplus bytes land in the next group's slot
       * and are overwritten by the next iteration. */
      _mm_storel_epi64((__m128i *)&r[22 * blk + 16], upper);
    }
    else
    {
      /* Last group: write exactly 6 bytes to stay inside r. */
      mlk_memcpy(&r[22 * blk + 16], &upper, 6);
    }
  }
}
/*
 * Deserialize and decompress a polynomial stored with 11 bits per
 * coefficient.
 *
 * Every group of 22 input bytes expands to 16 coefficients, each computed
 * as a rounded y * MLKEM_Q / 2^11 for the 11-bit field y.
 *
 * r: output coefficients (written with unaligned 256-bit stores).
 * a: input buffer of MLKEM_POLYCOMPRESSEDBYTES_D11 bytes.
 */
void mlk_poly_decompress_d11_avx2(
    int16_t *MLK_RESTRICT r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_D11])
{
  const unsigned int last = (MLKEM_N / 16) - 1;
  const __m256i q = _mm256_load_si256(
      (const __m256i *)&mlk_qdata[MLK_AVX2_BACKEND_DATA_OFFSET_16XQ]);
  /* Places into each 16-bit lane two bytes overlapping its 11-bit field. */
  const __m256i shufbidx =
      _mm256_set_epi8(13, 12, 12, 11, 10, 9, 9, 8, 8, 7, 6, 5, 5, 4, 4, 3, 10,
                      9, 9, 8, 7, 6, 6, 5, 5, 4, 3, 2, 2, 1, 1, 0);
  /* Per-lane shift amounts and multipliers (powers of two) that bring
   * every field to a common position inside its 16-bit lane. */
  const __m256i srlvdidx = _mm256_set_epi32(0, 0, 1, 0, 0, 0, 1, 0);
  const __m256i srlvqidx = _mm256_set_epi64x(2, 0, 2, 0);
  const __m256i shift =
      _mm256_set_epi16(4, 32, 1, 8, 32, 1, 4, 32, 4, 32, 1, 8, 32, 1, 4, 32);
  /* 32752 == 2047 << 4: after the final shift right by one the field
   * occupies bits 4..14, so mulhrs by MLKEM_Q computes a rounded
   * (y << 4) * MLKEM_Q / 2^15 == y * MLKEM_Q / 2^11. */
  /* check-magic: off */
  const __m256i mask = _mm256_set1_epi16(32752);
  /* check-magic: on */
  __m256i f;
  unsigned int blk;

  for (blk = 0; blk <= last; blk++)
  {
    if (blk < last)
    {
      /* A full 32-byte load is in bounds for every group but the last. */
      f = _mm256_loadu_si256((const __m256i *)&a[22 * blk]);
    }
    else
    {
      /* Last group: copy exactly 22 bytes to avoid reading past the end
       * of a. The remaining bytes of f keep their previous contents; the
       * permute/shuffle below only consume bytes 0..21. */
      mlk_memcpy(&f, &a[22 * blk], 22);
    }

    /* Make bytes 8..23 available in the upper 128-bit lane, then spread
     * the byte pairs over the 16-bit lanes. */
    f = _mm256_shuffle_epi8(_mm256_permute4x64_epi64(f, 0x94), shufbidx);
    /* Align the fields. */
    f = _mm256_srlv_epi64(_mm256_srlv_epi32(f, srlvdidx), srlvqidx);
    f = _mm256_srli_epi16(_mm256_mullo_epi16(f, shift), 1);
    /* Isolate the fields and scale by MLKEM_Q with rounding. */
    f = _mm256_mulhrs_epi16(_mm256_and_si256(f, mask), q);
    _mm256_storeu_si256((__m256i *)&r[16 * blk], f);
  }
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_K == 4 */
#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
MLK_EMPTY_CU(avx2_poly_compress)
#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
!MLK_CONFIG_MULTILEVEL_NO_SHARED) */

View File

@@ -0,0 +1,286 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include "consts.h"
#define MLK_AVX2_Q MLKEM_Q
/* check-magic: -1044 == pow(2,16,MLKEM_Q) */
#define MLK_AVX2_MONT -1044
/* check-magic: -3327 == pow(MLKEM_Q,-1,2^16) */
#define MLK_AVX2_QINV -3327
/* check-magic: 20159 == round(2^26/MLKEM_Q) */
#define MLK_AVX2_V 20159
/* check-magic: 1441 == pow(2,32-7,MLKEM_Q) */
#define MLK_AVX2_FHI 1441
/* check-magic: -10079 == signed_mod(MLK_AVX2_QINV*MLK_AVX2_FHI,2^16) */
#define MLK_AVX2_FLO -10079
/* check-magic: 1353 == pow(2, 32, MLKEM_Q) */
#define MLK_AVX2_MONTSQHI 1353
/* check-magic: 20553 == signed_mod(MLK_AVX2_QINV*MLK_AVX2_MONTSQHI,2^16) */
#define MLK_AVX2_MONTSQLO 20553
#define MLK_AVX2_MASK 4095
#define MLK_AVX2_SHIFT 32
MLK_ALIGN const int16_t mlk_qdata[768] = {
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQ 0
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
MLK_AVX2_Q,
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV 16
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
MLK_AVX2_QINV,
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XV 32
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
MLK_AVX2_V,
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO 48
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
MLK_AVX2_FLO,
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI 64
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
MLK_AVX2_FHI,
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO 80
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
MLK_AVX2_MONTSQLO,
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI 96
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
MLK_AVX2_MONTSQHI,
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK 112
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
MLK_AVX2_MASK,
#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB 128
/* TODO: Explain these numbers */
/* check-magic: off */
3854,
3340,
2826,
2312,
1798,
1284,
770,
256,
3854,
3340,
2826,
2312,
1798,
1284,
770,
256,
/* check-magic: on */
#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD 144
7,
0,
6,
0,
5,
0,
4,
0,
3,
0,
2,
0,
1,
0,
0,
0,
#define MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP 160
#include "x86_64_zetas.i"
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT 624
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
MLK_AVX2_SHIFT,
#define MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES 640
#include "x86_64_mulcache_twiddles.i"
};
#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
MLK_EMPTY_CU(avx2_consts)
#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
!MLK_CONFIG_MULTILEVEL_NO_SHARED) */
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef MLK_AVX2_Q
#undef MLK_AVX2_MONT
#undef MLK_AVX2_QINV
#undef MLK_AVX2_V
#undef MLK_AVX2_FHI
#undef MLK_AVX2_FLO
#undef MLK_AVX2_MONTSQHI
#undef MLK_AVX2_MONTSQLO
#undef MLK_AVX2_MASK
#undef MLK_AVX2_SHIFT
/* Some macros are kept because they are also defined in a header. */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XQ (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XV (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT (consts.h) */
/* Keep: MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES (consts.h) */

View File

@@ -0,0 +1,43 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*/
#ifndef MLK_NATIVE_X86_64_SRC_CONSTS_H
#define MLK_NATIVE_X86_64_SRC_CONSTS_H
#include "../../../common.h"
/*
 * Start indices of the sections of mlk_qdata, counted in int16_t elements.
 * These values must stay identical to the offsets defined alongside the
 * table initializer. Code that addresses the table in bytes (e.g.
 * assembly) has to scale them by sizeof(int16_t).
 */
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQ 0
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV 16
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XV 32
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFLO 48
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XFHI 64
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQLO 80
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMONTSQHI 96
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XMASK 112
#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXB 128
#define MLK_AVX2_BACKEND_DATA_OFFSET_REVIDXD 144
#define MLK_AVX2_BACKEND_DATA_OFFSET_ZETAS_EXP 160
#define MLK_AVX2_BACKEND_DATA_OFFSET_16XSHIFT 624
#define MLK_AVX2_BACKEND_DATA_OFFSET_MULCACHE_TWIDDLES 640
/* The C declaration is hidden when the header is included from assembly;
 * only the offset macros above are visible there. */
#ifndef __ASSEMBLER__
/* Namespaced symbol name of the constant table. */
#define mlk_qdata MLK_NAMESPACE(qdata)
extern const int16_t mlk_qdata[768];
#endif
#endif /* !MLK_NATIVE_X86_64_SRC_CONSTS_H */

View File

@@ -0,0 +1,42 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*/
/* red16: reduce the 16 signed 16-bit lanes of %ymm\r in place.
 *
 *   x = high16(r * ymm1)
 *   x = rs ? mulhrs(x, ymm\rs) : (x >> 10, arithmetic)
 *   r = r - low16(x * ymm0)
 *
 * Arguments:
 *   r  - number of the ymm register to reduce (updated in place)
 *   rs - optional number of a ymm register used as vpmulhrsw multiplier
 *        in place of the fixed shift; since 0 means "absent", %ymm0
 *        cannot be selected here
 *   x  - number of the scratch ymm register (default 12, clobbered)
 *
 * Assumes %ymm0 holds the modulus q and %ymm1 the Barrett multiplier
 * v = round(2^26/q) in every lane -- set up by the including code;
 * TODO confirm at the use sites. */
.macro red16 r,rs=0,x=12
vpmulhw %ymm1,%ymm\r,%ymm\x
.if \rs
vpmulhrsw %ymm\rs,%ymm\x,%ymm\x
.else
vpsraw $10,%ymm\x,%ymm\x
.endif
vpmullw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\r,%ymm\r
.endm
/* csubq: per 16-bit lane, subtract %ymm0 from %ymm\r and add it back in
 * the lanes where the difference is negative:
 *
 *   r = r - ymm0
 *   r = r + ((r >> 15, arithmetic) & ymm0)
 *
 * Arguments:
 *   r - number of the ymm register to update in place
 *   x - number of the scratch ymm register (default 12, clobbered)
 *
 * Assumes %ymm0 holds the modulus q in every lane -- TODO confirm at the
 * use sites. */
.macro csubq r,x=12
vpsubw %ymm0,%ymm\r,%ymm\r
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
/* caddq: per 16-bit lane, add %ymm0 to %ymm\r in the lanes that are
 * negative:
 *
 *   r = r + ((r >> 15, arithmetic) & ymm0)
 *
 * Arguments:
 *   r - number of the ymm register to update in place
 *   x - number of the scratch ymm register (default 12, clobbered)
 *
 * Assumes %ymm0 holds the modulus q in every lane -- TODO confirm at the
 * use sites. */
.macro caddq r,x=12
vpsraw $15,%ymm\r,%ymm\x
vpand %ymm0,%ymm\x,%ymm\x
vpaddw %ymm\x,%ymm\r,%ymm\r
.endm
/* Montgomery multiplication between b and ah,
 * with Montgomery twist of ah in al.
 *
 * Per 16-bit lane:
 *
 *   x = low16(b * al)
 *   b = high16(b * ah)
 *   b = b - high16(x * ymm0)
 *
 * Arguments:
 *   al - number of the ymm register holding the twisted constant
 *   ah - number of the ymm register holding the constant itself
 *   b  - number of the ymm register holding the other factor; receives
 *        the result
 *   x  - number of the scratch ymm register (default 12, clobbered)
 *
 * Assumes %ymm0 holds the modulus q in every lane -- TODO confirm at the
 * use sites. */
.macro fqmulprecomp al,ah,b,x=12
vpmullw %ymm\al,%ymm\b,%ymm\x
vpmulhw %ymm\ah,%ymm\b,%ymm\b
vpmulhw %ymm0,%ymm\x,%ymm\x
vpsubw %ymm\x,%ymm\b,%ymm\b
.endm

View File

@@ -0,0 +1,697 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [AVX2_NTT]
* Faster AVX2 optimized NTT multiplication for Ring-LWE lattice cryptography.
* Gregor Seiler
* https://eprint.iacr.org/2018/039
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*
* The core ideas behind the implementation are described in @[AVX2_NTT].
*
* Changes:
* - Different placement of modular reductions to simplify
* reasoning of non-overflow
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/intt.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_invntt_avx2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_invntt_avx2)
S2N_BN_SYMBOL(mlkem_invntt_avx2):
.cfi_startproc
vmovdqa (%rsi), %ymm0
vmovdqa 0x60(%rsi), %ymm2
vmovdqa 0x80(%rsi), %ymm3
vmovdqa (%rdi), %ymm4
vmovdqa 0x40(%rdi), %ymm6
vmovdqa 0x20(%rdi), %ymm5
vmovdqa 0x60(%rdi), %ymm7
vpmullw %ymm2, %ymm4, %ymm12
vpmulhw %ymm3, %ymm4, %ymm4
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm4, %ymm4
vpmullw %ymm2, %ymm6, %ymm12
vpmulhw %ymm3, %ymm6, %ymm6
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm6, %ymm6
vpmullw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm5, %ymm5
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm5, %ymm5
vpmullw %ymm2, %ymm7, %ymm12
vpmulhw %ymm3, %ymm7, %ymm7
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm7, %ymm7
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xc0(%rdi), %ymm10
vmovdqa 0xa0(%rdi), %ymm9
vmovdqa 0xe0(%rdi), %ymm11
vpmullw %ymm2, %ymm8, %ymm12
vpmulhw %ymm3, %ymm8, %ymm8
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm8, %ymm8
vpmullw %ymm2, %ymm10, %ymm12
vpmulhw %ymm3, %ymm10, %ymm10
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm10, %ymm10
vpmullw %ymm2, %ymm9, %ymm12
vpmulhw %ymm3, %ymm9, %ymm9
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm9, %ymm9
vpmullw %ymm2, %ymm11, %ymm12
vpmulhw %ymm3, %ymm11, %ymm11
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm11, %ymm11
vpermq $0x4e, 0x4a0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
vpermq $0x4e, 0x460(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
vpermq $0x4e, 0x4c0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
vpermq $0x4e, 0x480(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
vmovdqa 0x100(%rsi), %ymm12
vpshufb %ymm12, %ymm15, %ymm15
vpshufb %ymm12, %ymm1, %ymm1
vpshufb %ymm12, %ymm2, %ymm2
vpshufb %ymm12, %ymm3, %ymm3
vpsubw %ymm4, %ymm6, %ymm12
vpaddw %ymm6, %ymm4, %ymm4
vpsubw %ymm5, %ymm7, %ymm13
vpmullw %ymm15, %ymm12, %ymm6
vpaddw %ymm7, %ymm5, %ymm5
vpsubw %ymm8, %ymm10, %ymm14
vpmullw %ymm15, %ymm13, %ymm7
vpaddw %ymm10, %ymm8, %ymm8
vpsubw %ymm9, %ymm11, %ymm15
vpmullw %ymm1, %ymm14, %ymm10
vpaddw %ymm11, %ymm9, %ymm9
vpmullw %ymm1, %ymm15, %ymm11
vpmulhw %ymm2, %ymm12, %ymm12
vpmulhw %ymm2, %ymm13, %ymm13
vpmulhw %ymm3, %ymm14, %ymm14
vpmulhw %ymm3, %ymm15, %ymm15
vpmulhw %ymm0, %ymm6, %ymm6
vpmulhw %ymm0, %ymm7, %ymm7
vpmulhw %ymm0, %ymm10, %ymm10
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm6, %ymm12, %ymm6
vpsubw %ymm7, %ymm13, %ymm7
vpsubw %ymm10, %ymm14, %ymm10
vpsubw %ymm11, %ymm15, %ymm11
vpermq $0x4e, 0x420(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
vpermq $0x4e, 0x440(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
vmovdqa 0x100(%rsi), %ymm1
vpshufb %ymm1, %ymm2, %ymm2
vpshufb %ymm1, %ymm3, %ymm3
vpsubw %ymm4, %ymm8, %ymm12
vpaddw %ymm8, %ymm4, %ymm4
vpsubw %ymm5, %ymm9, %ymm13
vpmullw %ymm2, %ymm12, %ymm8
vpaddw %ymm9, %ymm5, %ymm5
vpsubw %ymm6, %ymm10, %ymm14
vpmullw %ymm2, %ymm13, %ymm9
vpaddw %ymm10, %ymm6, %ymm6
vpsubw %ymm7, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm10
vpaddw %ymm11, %ymm7, %ymm7
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm3, %ymm12, %ymm12
vpmulhw %ymm3, %ymm13, %ymm13
vpmulhw %ymm3, %ymm14, %ymm14
vpmulhw %ymm3, %ymm15, %ymm15
vpmulhw %ymm0, %ymm8, %ymm8
vpmulhw %ymm0, %ymm9, %ymm9
vpmulhw %ymm0, %ymm10, %ymm10
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm8, %ymm12, %ymm8
vpsubw %ymm9, %ymm13, %ymm9
vpsubw %ymm10, %ymm14, %ymm10
vpsubw %ymm11, %ymm15, %ymm11
vpslld $0x10, %ymm5, %ymm3
vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
vpsrld $0x10, %ymm4, %ymm4
vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
vpslld $0x10, %ymm7, %ymm4
vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
vpsrld $0x10, %ymm6, %ymm6
vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
vpslld $0x10, %ymm9, %ymm6
vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
vpsrld $0x10, %ymm8, %ymm8
vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
vpslld $0x10, %ymm11, %ymm8
vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
vpsrld $0x10, %ymm10, %ymm10
vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
vmovdqa 0x120(%rsi), %ymm12
vpermd 0x3e0(%rsi), %ymm12, %ymm2
vpermd 0x400(%rsi), %ymm12, %ymm10
vpsubw %ymm3, %ymm5, %ymm12
vpaddw %ymm5, %ymm3, %ymm3
vpsubw %ymm4, %ymm7, %ymm13
vpmullw %ymm2, %ymm12, %ymm5
vpaddw %ymm7, %ymm4, %ymm4
vpsubw %ymm6, %ymm9, %ymm14
vpmullw %ymm2, %ymm13, %ymm7
vpaddw %ymm9, %ymm6, %ymm6
vpsubw %ymm8, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm9
vpaddw %ymm11, %ymm8, %ymm8
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm10, %ymm12, %ymm12
vpmulhw %ymm10, %ymm13, %ymm13
vpmulhw %ymm10, %ymm14, %ymm14
vpmulhw %ymm10, %ymm15, %ymm15
vpmulhw %ymm0, %ymm5, %ymm5
vpmulhw %ymm0, %ymm7, %ymm7
vpmulhw %ymm0, %ymm9, %ymm9
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm5, %ymm12, %ymm5
vpsubw %ymm7, %ymm13, %ymm7
vpsubw %ymm9, %ymm14, %ymm9
vpsubw %ymm11, %ymm15, %ymm11
vmovdqa 0x40(%rsi), %ymm1
vpmulhw %ymm1, %ymm3, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm3, %ymm3
vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
vpsrlq $0x20, %ymm3, %ymm3
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
vpsrlq $0x20, %ymm6, %ymm6
vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
vpsrlq $0x20, %ymm5, %ymm5
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
vpsrlq $0x20, %ymm9, %ymm9
vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
vpermq $0x1b, 0x3a0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
vpermq $0x1b, 0x3c0(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
vpsubw %ymm10, %ymm4, %ymm12
vpaddw %ymm4, %ymm10, %ymm10
vpsubw %ymm3, %ymm8, %ymm13
vpmullw %ymm2, %ymm12, %ymm4
vpaddw %ymm8, %ymm3, %ymm3
vpsubw %ymm6, %ymm7, %ymm14
vpmullw %ymm2, %ymm13, %ymm8
vpaddw %ymm7, %ymm6, %ymm6
vpsubw %ymm5, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm7
vpaddw %ymm11, %ymm5, %ymm5
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm9, %ymm12, %ymm12
vpmulhw %ymm9, %ymm13, %ymm13
vpmulhw %ymm9, %ymm14, %ymm14
vpmulhw %ymm9, %ymm15, %ymm15
vpmulhw %ymm0, %ymm4, %ymm4
vpmulhw %ymm0, %ymm8, %ymm8
vpmulhw %ymm0, %ymm7, %ymm7
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm4, %ymm12, %ymm4
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm7, %ymm14, %ymm7
vpsubw %ymm11, %ymm15, %ymm11
vpmulhw %ymm1, %ymm10, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm10, %ymm10
vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
vpermq $0x4e, 0x360(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
vpermq $0x4e, 0x380(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
vpsubw %ymm9, %ymm3, %ymm12
vpaddw %ymm3, %ymm9, %ymm9
vpsubw %ymm10, %ymm5, %ymm13
vpmullw %ymm2, %ymm12, %ymm3
vpaddw %ymm5, %ymm10, %ymm10
vpsubw %ymm6, %ymm8, %ymm14
vpmullw %ymm2, %ymm13, %ymm5
vpaddw %ymm8, %ymm6, %ymm6
vpsubw %ymm4, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm8
vpaddw %ymm11, %ymm4, %ymm4
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm7, %ymm12, %ymm12
vpmulhw %ymm7, %ymm13, %ymm13
vpmulhw %ymm7, %ymm14, %ymm14
vpmulhw %ymm7, %ymm15, %ymm15
vpmulhw %ymm0, %ymm3, %ymm3
vpmulhw %ymm0, %ymm5, %ymm5
vpmulhw %ymm0, %ymm8, %ymm8
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm3, %ymm12, %ymm3
vpsubw %ymm5, %ymm13, %ymm5
vpsubw %ymm8, %ymm14, %ymm8
vpsubw %ymm11, %ymm15, %ymm11
vpmulhw %ymm1, %ymm9, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm9, %ymm9
vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
vmovdqa 0x320(%rsi), %ymm2
vmovdqa 0x340(%rsi), %ymm8
vpsubw %ymm7, %ymm10, %ymm12
vpaddw %ymm10, %ymm7, %ymm7
vpsubw %ymm9, %ymm4, %ymm13
vpmullw %ymm2, %ymm12, %ymm10
vpaddw %ymm4, %ymm9, %ymm9
vpsubw %ymm6, %ymm5, %ymm14
vpmullw %ymm2, %ymm13, %ymm4
vpaddw %ymm5, %ymm6, %ymm6
vpsubw %ymm3, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm5
vpaddw %ymm11, %ymm3, %ymm3
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm8, %ymm12, %ymm12
vpmulhw %ymm8, %ymm13, %ymm13
vpmulhw %ymm8, %ymm14, %ymm14
vpmulhw %ymm8, %ymm15, %ymm15
vpmulhw %ymm0, %ymm10, %ymm10
vpmulhw %ymm0, %ymm4, %ymm4
vpmulhw %ymm0, %ymm5, %ymm5
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm10, %ymm12, %ymm10
vpsubw %ymm4, %ymm13, %ymm4
vpsubw %ymm5, %ymm14, %ymm5
vpsubw %ymm11, %ymm15, %ymm11
vpmulhw %ymm1, %ymm7, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm7, %ymm7
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
vmovdqa %ymm6, 0x40(%rdi)
vmovdqa %ymm3, 0x60(%rdi)
vmovdqa %ymm10, 0x80(%rdi)
vmovdqa %ymm4, 0xa0(%rdi)
vmovdqa %ymm5, 0xc0(%rdi)
vmovdqa %ymm11, 0xe0(%rdi)
vmovdqa 0x60(%rsi), %ymm2
vmovdqa 0x80(%rsi), %ymm3
vmovdqa 0x100(%rdi), %ymm4
vmovdqa 0x140(%rdi), %ymm6
vmovdqa 0x120(%rdi), %ymm5
vmovdqa 0x160(%rdi), %ymm7
vpmullw %ymm2, %ymm4, %ymm12
vpmulhw %ymm3, %ymm4, %ymm4
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm4, %ymm4
vpmullw %ymm2, %ymm6, %ymm12
vpmulhw %ymm3, %ymm6, %ymm6
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm6, %ymm6
vpmullw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm5, %ymm5
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm5, %ymm5
vpmullw %ymm2, %ymm7, %ymm12
vpmulhw %ymm3, %ymm7, %ymm7
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm7, %ymm7
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1c0(%rdi), %ymm10
vmovdqa 0x1a0(%rdi), %ymm9
vmovdqa 0x1e0(%rdi), %ymm11
vpmullw %ymm2, %ymm8, %ymm12
vpmulhw %ymm3, %ymm8, %ymm8
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm8, %ymm8
vpmullw %ymm2, %ymm10, %ymm12
vpmulhw %ymm3, %ymm10, %ymm10
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm10, %ymm10
vpmullw %ymm2, %ymm9, %ymm12
vpmulhw %ymm3, %ymm9, %ymm9
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm9, %ymm9
vpmullw %ymm2, %ymm11, %ymm12
vpmulhw %ymm3, %ymm11, %ymm11
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm11, %ymm11
vpermq $0x4e, 0x2e0(%rsi), %ymm15 # ymm15 = mem[2,3,0,1]
vpermq $0x4e, 0x2a0(%rsi), %ymm1 # ymm1 = mem[2,3,0,1]
vpermq $0x4e, 0x300(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
vpermq $0x4e, 0x2c0(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
vmovdqa 0x100(%rsi), %ymm12
vpshufb %ymm12, %ymm15, %ymm15
vpshufb %ymm12, %ymm1, %ymm1
vpshufb %ymm12, %ymm2, %ymm2
vpshufb %ymm12, %ymm3, %ymm3
vpsubw %ymm4, %ymm6, %ymm12
vpaddw %ymm6, %ymm4, %ymm4
vpsubw %ymm5, %ymm7, %ymm13
vpmullw %ymm15, %ymm12, %ymm6
vpaddw %ymm7, %ymm5, %ymm5
vpsubw %ymm8, %ymm10, %ymm14
vpmullw %ymm15, %ymm13, %ymm7
vpaddw %ymm10, %ymm8, %ymm8
vpsubw %ymm9, %ymm11, %ymm15
vpmullw %ymm1, %ymm14, %ymm10
vpaddw %ymm11, %ymm9, %ymm9
vpmullw %ymm1, %ymm15, %ymm11
vpmulhw %ymm2, %ymm12, %ymm12
vpmulhw %ymm2, %ymm13, %ymm13
vpmulhw %ymm3, %ymm14, %ymm14
vpmulhw %ymm3, %ymm15, %ymm15
vpmulhw %ymm0, %ymm6, %ymm6
vpmulhw %ymm0, %ymm7, %ymm7
vpmulhw %ymm0, %ymm10, %ymm10
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm6, %ymm12, %ymm6
vpsubw %ymm7, %ymm13, %ymm7
vpsubw %ymm10, %ymm14, %ymm10
vpsubw %ymm11, %ymm15, %ymm11
vpermq $0x4e, 0x260(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
vpermq $0x4e, 0x280(%rsi), %ymm3 # ymm3 = mem[2,3,0,1]
vmovdqa 0x100(%rsi), %ymm1
vpshufb %ymm1, %ymm2, %ymm2
vpshufb %ymm1, %ymm3, %ymm3
vpsubw %ymm4, %ymm8, %ymm12
vpaddw %ymm8, %ymm4, %ymm4
vpsubw %ymm5, %ymm9, %ymm13
vpmullw %ymm2, %ymm12, %ymm8
vpaddw %ymm9, %ymm5, %ymm5
vpsubw %ymm6, %ymm10, %ymm14
vpmullw %ymm2, %ymm13, %ymm9
vpaddw %ymm10, %ymm6, %ymm6
vpsubw %ymm7, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm10
vpaddw %ymm11, %ymm7, %ymm7
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm3, %ymm12, %ymm12
vpmulhw %ymm3, %ymm13, %ymm13
vpmulhw %ymm3, %ymm14, %ymm14
vpmulhw %ymm3, %ymm15, %ymm15
vpmulhw %ymm0, %ymm8, %ymm8
vpmulhw %ymm0, %ymm9, %ymm9
vpmulhw %ymm0, %ymm10, %ymm10
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm8, %ymm12, %ymm8
vpsubw %ymm9, %ymm13, %ymm9
vpsubw %ymm10, %ymm14, %ymm10
vpsubw %ymm11, %ymm15, %ymm11
vpslld $0x10, %ymm5, %ymm3
vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
vpsrld $0x10, %ymm4, %ymm4
vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
vpslld $0x10, %ymm7, %ymm4
vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
vpsrld $0x10, %ymm6, %ymm6
vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
vpslld $0x10, %ymm9, %ymm6
vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
vpsrld $0x10, %ymm8, %ymm8
vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
vpslld $0x10, %ymm11, %ymm8
vpblendw $0xaa, %ymm8, %ymm10, %ymm8 # ymm8 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4],ymm8[5],ymm10[6],ymm8[7],ymm10[8],ymm8[9],ymm10[10],ymm8[11],ymm10[12],ymm8[13],ymm10[14],ymm8[15]
vpsrld $0x10, %ymm10, %ymm10
vpblendw $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7],ymm10[8],ymm11[9],ymm10[10],ymm11[11],ymm10[12],ymm11[13],ymm10[14],ymm11[15]
vmovdqa 0x120(%rsi), %ymm12
vpermd 0x220(%rsi), %ymm12, %ymm2
vpermd 0x240(%rsi), %ymm12, %ymm10
vpsubw %ymm3, %ymm5, %ymm12
vpaddw %ymm5, %ymm3, %ymm3
vpsubw %ymm4, %ymm7, %ymm13
vpmullw %ymm2, %ymm12, %ymm5
vpaddw %ymm7, %ymm4, %ymm4
vpsubw %ymm6, %ymm9, %ymm14
vpmullw %ymm2, %ymm13, %ymm7
vpaddw %ymm9, %ymm6, %ymm6
vpsubw %ymm8, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm9
vpaddw %ymm11, %ymm8, %ymm8
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm10, %ymm12, %ymm12
vpmulhw %ymm10, %ymm13, %ymm13
vpmulhw %ymm10, %ymm14, %ymm14
vpmulhw %ymm10, %ymm15, %ymm15
vpmulhw %ymm0, %ymm5, %ymm5
vpmulhw %ymm0, %ymm7, %ymm7
vpmulhw %ymm0, %ymm9, %ymm9
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm5, %ymm12, %ymm5
vpsubw %ymm7, %ymm13, %ymm7
vpsubw %ymm9, %ymm14, %ymm9
vpsubw %ymm11, %ymm15, %ymm11
vmovdqa 0x40(%rsi), %ymm1
vpmulhw %ymm1, %ymm3, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm3, %ymm3
vmovsldup %ymm4, %ymm10 # ymm10 = ymm4[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7]
vpsrlq $0x20, %ymm3, %ymm3
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
vmovsldup %ymm8, %ymm3 # ymm3 = ymm8[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
vpsrlq $0x20, %ymm6, %ymm6
vpblendd $0xaa, %ymm8, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm8[1],ymm6[2],ymm8[3],ymm6[4],ymm8[5],ymm6[6],ymm8[7]
vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
vpsrlq $0x20, %ymm5, %ymm5
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
vmovsldup %ymm11, %ymm5 # ymm5 = ymm11[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7]
vpsrlq $0x20, %ymm9, %ymm9
vpblendd $0xaa, %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4],ymm11[5],ymm9[6],ymm11[7]
vpermq $0x1b, 0x1e0(%rsi), %ymm2 # ymm2 = mem[3,2,1,0]
vpermq $0x1b, 0x200(%rsi), %ymm9 # ymm9 = mem[3,2,1,0]
vpsubw %ymm10, %ymm4, %ymm12
vpaddw %ymm4, %ymm10, %ymm10
vpsubw %ymm3, %ymm8, %ymm13
vpmullw %ymm2, %ymm12, %ymm4
vpaddw %ymm8, %ymm3, %ymm3
vpsubw %ymm6, %ymm7, %ymm14
vpmullw %ymm2, %ymm13, %ymm8
vpaddw %ymm7, %ymm6, %ymm6
vpsubw %ymm5, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm7
vpaddw %ymm11, %ymm5, %ymm5
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm9, %ymm12, %ymm12
vpmulhw %ymm9, %ymm13, %ymm13
vpmulhw %ymm9, %ymm14, %ymm14
vpmulhw %ymm9, %ymm15, %ymm15
vpmulhw %ymm0, %ymm4, %ymm4
vpmulhw %ymm0, %ymm8, %ymm8
vpmulhw %ymm0, %ymm7, %ymm7
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm4, %ymm12, %ymm4
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm7, %ymm14, %ymm7
vpsubw %ymm11, %ymm15, %ymm11
vpmulhw %ymm1, %ymm10, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm10, %ymm10
vpunpcklqdq %ymm3, %ymm10, %ymm9 # ymm9 = ymm10[0],ymm3[0],ymm10[2],ymm3[2]
vpunpckhqdq %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
vpunpcklqdq %ymm5, %ymm6, %ymm10 # ymm10 = ymm6[0],ymm5[0],ymm6[2],ymm5[2]
vpunpckhqdq %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[1],ymm5[1],ymm6[3],ymm5[3]
vpunpcklqdq %ymm8, %ymm4, %ymm6 # ymm6 = ymm4[0],ymm8[0],ymm4[2],ymm8[2]
vpunpckhqdq %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[1],ymm8[1],ymm4[3],ymm8[3]
vpunpcklqdq %ymm11, %ymm7, %ymm4 # ymm4 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
vpunpckhqdq %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[1],ymm11[1],ymm7[3],ymm11[3]
vpermq $0x4e, 0x1a0(%rsi), %ymm2 # ymm2 = mem[2,3,0,1]
vpermq $0x4e, 0x1c0(%rsi), %ymm7 # ymm7 = mem[2,3,0,1]
vpsubw %ymm9, %ymm3, %ymm12
vpaddw %ymm3, %ymm9, %ymm9
vpsubw %ymm10, %ymm5, %ymm13
vpmullw %ymm2, %ymm12, %ymm3
vpaddw %ymm5, %ymm10, %ymm10
vpsubw %ymm6, %ymm8, %ymm14
vpmullw %ymm2, %ymm13, %ymm5
vpaddw %ymm8, %ymm6, %ymm6
vpsubw %ymm4, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm8
vpaddw %ymm11, %ymm4, %ymm4
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm7, %ymm12, %ymm12
vpmulhw %ymm7, %ymm13, %ymm13
vpmulhw %ymm7, %ymm14, %ymm14
vpmulhw %ymm7, %ymm15, %ymm15
vpmulhw %ymm0, %ymm3, %ymm3
vpmulhw %ymm0, %ymm5, %ymm5
vpmulhw %ymm0, %ymm8, %ymm8
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm3, %ymm12, %ymm3
vpsubw %ymm5, %ymm13, %ymm5
vpsubw %ymm8, %ymm14, %ymm8
vpsubw %ymm11, %ymm15, %ymm11
vpmulhw %ymm1, %ymm9, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm9, %ymm9
vperm2i128 $0x20, %ymm10, %ymm9, %ymm7 # ymm7 = ymm9[0,1],ymm10[0,1]
vperm2i128 $0x31, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[2,3],ymm10[2,3]
vperm2i128 $0x20, %ymm4, %ymm6, %ymm9 # ymm9 = ymm6[0,1],ymm4[0,1]
vperm2i128 $0x31, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[2,3],ymm4[2,3]
vperm2i128 $0x20, %ymm5, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm5[0,1]
vperm2i128 $0x31, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[2,3],ymm5[2,3]
vperm2i128 $0x20, %ymm11, %ymm8, %ymm3 # ymm3 = ymm8[0,1],ymm11[0,1]
vperm2i128 $0x31, %ymm11, %ymm8, %ymm11 # ymm11 = ymm8[2,3],ymm11[2,3]
vmovdqa 0x160(%rsi), %ymm2
vmovdqa 0x180(%rsi), %ymm8
vpsubw %ymm7, %ymm10, %ymm12
vpaddw %ymm10, %ymm7, %ymm7
vpsubw %ymm9, %ymm4, %ymm13
vpmullw %ymm2, %ymm12, %ymm10
vpaddw %ymm4, %ymm9, %ymm9
vpsubw %ymm6, %ymm5, %ymm14
vpmullw %ymm2, %ymm13, %ymm4
vpaddw %ymm5, %ymm6, %ymm6
vpsubw %ymm3, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm5
vpaddw %ymm11, %ymm3, %ymm3
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm8, %ymm12, %ymm12
vpmulhw %ymm8, %ymm13, %ymm13
vpmulhw %ymm8, %ymm14, %ymm14
vpmulhw %ymm8, %ymm15, %ymm15
vpmulhw %ymm0, %ymm10, %ymm10
vpmulhw %ymm0, %ymm4, %ymm4
vpmulhw %ymm0, %ymm5, %ymm5
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm10, %ymm12, %ymm10
vpsubw %ymm4, %ymm13, %ymm4
vpsubw %ymm5, %ymm14, %ymm5
vpsubw %ymm11, %ymm15, %ymm11
vpmulhw %ymm1, %ymm7, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm7, %ymm7
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
vmovdqa %ymm6, 0x140(%rdi)
vmovdqa %ymm3, 0x160(%rdi)
vmovdqa %ymm10, 0x180(%rdi)
vmovdqa %ymm4, 0x1a0(%rdi)
vmovdqa %ymm5, 0x1c0(%rdi)
vmovdqa %ymm11, 0x1e0(%rdi)
vmovdqa (%rdi), %ymm4
vmovdqa 0x100(%rdi), %ymm8
vmovdqa 0x20(%rdi), %ymm5
vmovdqa 0x120(%rdi), %ymm9
vpbroadcastq 0x140(%rsi), %ymm2
vmovdqa 0x40(%rdi), %ymm6
vmovdqa 0x140(%rdi), %ymm10
vmovdqa 0x60(%rdi), %ymm7
vmovdqa 0x160(%rdi), %ymm11
vpbroadcastq 0x148(%rsi), %ymm3
vpsubw %ymm4, %ymm8, %ymm12
vpaddw %ymm8, %ymm4, %ymm4
vpsubw %ymm5, %ymm9, %ymm13
vpmullw %ymm2, %ymm12, %ymm8
vpaddw %ymm9, %ymm5, %ymm5
vpsubw %ymm6, %ymm10, %ymm14
vpmullw %ymm2, %ymm13, %ymm9
vpaddw %ymm10, %ymm6, %ymm6
vpsubw %ymm7, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm10
vpaddw %ymm11, %ymm7, %ymm7
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm3, %ymm12, %ymm12
vpmulhw %ymm3, %ymm13, %ymm13
vpmulhw %ymm3, %ymm14, %ymm14
vpmulhw %ymm3, %ymm15, %ymm15
vpmulhw %ymm0, %ymm8, %ymm8
vpmulhw %ymm0, %ymm9, %ymm9
vpmulhw %ymm0, %ymm10, %ymm10
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm8, %ymm12, %ymm8
vpsubw %ymm9, %ymm13, %ymm9
vpsubw %ymm10, %ymm14, %ymm10
vpsubw %ymm11, %ymm15, %ymm11
vmovdqa %ymm4, (%rdi)
vmovdqa %ymm5, 0x20(%rdi)
vmovdqa %ymm6, 0x40(%rdi)
vmovdqa %ymm7, 0x60(%rdi)
vmovdqa %ymm8, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
vmovdqa %ymm10, 0x140(%rdi)
vmovdqa %ymm11, 0x160(%rdi)
vmovdqa 0x80(%rdi), %ymm4
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm5
vmovdqa 0x1a0(%rdi), %ymm9
vpbroadcastq 0x140(%rsi), %ymm2
vmovdqa 0xc0(%rdi), %ymm6
vmovdqa 0x1c0(%rdi), %ymm10
vmovdqa 0xe0(%rdi), %ymm7
vmovdqa 0x1e0(%rdi), %ymm11
vpbroadcastq 0x148(%rsi), %ymm3
vpsubw %ymm4, %ymm8, %ymm12
vpaddw %ymm8, %ymm4, %ymm4
vpsubw %ymm5, %ymm9, %ymm13
vpmullw %ymm2, %ymm12, %ymm8
vpaddw %ymm9, %ymm5, %ymm5
vpsubw %ymm6, %ymm10, %ymm14
vpmullw %ymm2, %ymm13, %ymm9
vpaddw %ymm10, %ymm6, %ymm6
vpsubw %ymm7, %ymm11, %ymm15
vpmullw %ymm2, %ymm14, %ymm10
vpaddw %ymm11, %ymm7, %ymm7
vpmullw %ymm2, %ymm15, %ymm11
vpmulhw %ymm3, %ymm12, %ymm12
vpmulhw %ymm3, %ymm13, %ymm13
vpmulhw %ymm3, %ymm14, %ymm14
vpmulhw %ymm3, %ymm15, %ymm15
vpmulhw %ymm0, %ymm8, %ymm8
vpmulhw %ymm0, %ymm9, %ymm9
vpmulhw %ymm0, %ymm10, %ymm10
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm8, %ymm12, %ymm8
vpsubw %ymm9, %ymm13, %ymm9
vpsubw %ymm10, %ymm14, %ymm10
vpsubw %ymm11, %ymm15, %ymm11
vmovdqa %ymm4, 0x80(%rdi)
vmovdqa %ymm5, 0xa0(%rdi)
vmovdqa %ymm6, 0xc0(%rdi)
vmovdqa %ymm7, 0xe0(%rdi)
vmovdqa %ymm8, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
vmovdqa %ymm10, 0x1c0(%rdi)
vmovdqa %ymm11, 0x1e0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,80 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/mulcache_compute.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_poly_mulcache_compute_avx2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_poly_mulcache_compute_avx2)
/*
 * mlkem_poly_mulcache_compute_avx2
 *
 * Multiplies eight 32-byte vectors of 16-bit lanes from the source by
 * per-group constants from a table and stores the eight reduced products.
 *
 * Operands, as used by the code below:
 *   rdi - destination; vectors are stored at offsets 0x00..0xe0 (256 bytes).
 *   rsi - source; vectors are loaded from offsets 0x20, 0x60, 0xa0, ...,
 *         0x1e0, i.e. every second 32-byte vector starting with the second.
 *   rdx - constant table; reads offset 0x00, offsets 0x500..0x560 and
 *         offsets 0x580..0x5e0.
 * Every memory access is a vmovdqa, so all three pointers must be 32-byte
 * aligned.
 *
 * Arithmetic performed on each 16-bit lane (a = source lane):
 *   out = hi16(a * t_hi) - hi16(lo16(a * t_lo) * c)
 * with t_hi from rdx+0x500.., t_lo from rdx+0x580.. and c from rdx+0x00.
 * NOTE(review): this has the shape of a signed Montgomery multiplication
 * with c = q and t_lo = t_hi * q^-1 mod 2^16 - confirm against the layout
 * of the constants table.
 *
 * Writes ymm0-ymm10; no vzeroupper is issued before returning.
 */
S2N_BN_SYMBOL(mlkem_poly_mulcache_compute_avx2):
.cfi_startproc
/* ymm0 = table vector at offset 0; multiplier of the correction term in all four groups. */
vmovdqa (%rdx), %ymm0
/* Group 0: source vectors 0x20/0x60, constants 0x500 (high) and 0x580 (low), results to 0x00/0x20. */
vmovdqa 0x20(%rsi), %ymm2
vmovdqa 0x60(%rsi), %ymm3
vmovdqa 0x500(%rdx), %ymm4
vmovdqa 0x580(%rdx), %ymm1
/* Low 16 bits of a * t_lo. */
vpmullw %ymm2, %ymm1, %ymm5
vpmullw %ymm3, %ymm1, %ymm6
/* High 16 bits of a * t_hi. */
vpmulhw %ymm2, %ymm4, %ymm7
vpmulhw %ymm3, %ymm4, %ymm8
/* Correction term: high 16 bits of (low product * ymm0). */
vpmulhw %ymm5, %ymm0, %ymm9
vpmulhw %ymm6, %ymm0, %ymm10
/* Result = high product - correction. */
vpsubw %ymm9, %ymm7, %ymm7
vpsubw %ymm10, %ymm8, %ymm8
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm8, 0x20(%rdi)
/* Group 1: source vectors 0xa0/0xe0, constants 0x520 and 0x5a0, results to 0x40/0x60. */
vmovdqa 0xa0(%rsi), %ymm2
vmovdqa 0xe0(%rsi), %ymm3
vmovdqa 0x520(%rdx), %ymm4
vmovdqa 0x5a0(%rdx), %ymm1
vpmullw %ymm2, %ymm1, %ymm5
vpmullw %ymm3, %ymm1, %ymm6
vpmulhw %ymm2, %ymm4, %ymm7
vpmulhw %ymm3, %ymm4, %ymm8
vpmulhw %ymm5, %ymm0, %ymm9
vpmulhw %ymm6, %ymm0, %ymm10
vpsubw %ymm9, %ymm7, %ymm7
vpsubw %ymm10, %ymm8, %ymm8
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm8, 0x60(%rdi)
/* Group 2: source vectors 0x120/0x160, constants 0x540 and 0x5c0, results to 0x80/0xa0. */
vmovdqa 0x120(%rsi), %ymm2
vmovdqa 0x160(%rsi), %ymm3
vmovdqa 0x540(%rdx), %ymm4
vmovdqa 0x5c0(%rdx), %ymm1
vpmullw %ymm2, %ymm1, %ymm5
vpmullw %ymm3, %ymm1, %ymm6
vpmulhw %ymm2, %ymm4, %ymm7
vpmulhw %ymm3, %ymm4, %ymm8
vpmulhw %ymm5, %ymm0, %ymm9
vpmulhw %ymm6, %ymm0, %ymm10
vpsubw %ymm9, %ymm7, %ymm7
vpsubw %ymm10, %ymm8, %ymm8
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm8, 0xa0(%rdi)
/* Group 3: source vectors 0x1a0/0x1e0, constants 0x560 and 0x5e0, results to 0xc0/0xe0. */
vmovdqa 0x1a0(%rsi), %ymm2
vmovdqa 0x1e0(%rsi), %ymm3
vmovdqa 0x560(%rdx), %ymm4
vmovdqa 0x5e0(%rdx), %ymm1
vpmullw %ymm2, %ymm1, %ymm5
vpmullw %ymm3, %ymm1, %ymm6
vpmulhw %ymm2, %ymm4, %ymm7
vpmulhw %ymm3, %ymm4, %ymm8
vpmulhw %ymm5, %ymm0, %ymm9
vpmulhw %ymm6, %ymm0, %ymm10
vpsubw %ymm9, %ymm7, %ymm7
vpsubw %ymm10, %ymm8, %ymm8
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm8, 0xe0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,629 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [AVX2_NTT]
* Faster AVX2 optimized NTT multiplication for Ring-LWE lattice cryptography.
* Gregor Seiler
* https://eprint.iacr.org/2018/039
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*
* The core ideas behind the implementation are described in @[AVX2_NTT].
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/ntt.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_ntt_avx2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_ntt_avx2)
/*
 * mlkem_ntt_avx2
 *
 * In-place transform of sixteen 32-byte vectors of 16-bit lanes (512 bytes)
 * built from seven rounds of add/subtract butterflies; per the symbol name
 * and the file header this is the ML-KEM forward NTT.
 *
 * Operands, as used by the code below:
 *   rdi - data buffer; offsets 0x000..0x1e0 are both read and written.
 *   rsi - constant table; reads offset 0x00, the two quadwords at
 *         0x140/0x148, and full vectors at offsets 0x160..0x4c0.
 * Every full-vector memory access is a vmovdqa, so both pointers must be
 * 32-byte aligned.
 *
 * Butterfly used in every round, per 16-bit lane, for a data pair (a, b)
 * and a constant pair (z_lo, z_hi):
 *   t  = hi16(b * z_hi) - hi16(lo16(b * z_lo) * c)      (c = lane of ymm0)
 *   a' = a + t
 *   b' = a - t
 * NOTE(review): this has the shape of a signed Montgomery multiplication by
 * a twiddle factor with c = q and z_lo = z_hi * q^-1 mod 2^16 - confirm
 * against the layout of the constants table.
 *
 * Structure:
 *   1. One round across the two 256-byte halves of the buffer, done in two
 *      passes of four vector pairs, with results written back to memory.
 *   2. Six further rounds on the lower half (0x000..0x0e0), kept entirely
 *      in registers, with lane interleaving between rounds at 128-, 64-,
 *      32- and 16-bit granularity; then one store of the eight vectors.
 *   3. The same six rounds on the upper half (0x100..0x1e0) using the next
 *      set of table entries.
 * No reduction other than the one inside the butterfly multiply is visible
 * in this routine.
 */
S2N_BN_SYMBOL(mlkem_ntt_avx2):
.cfi_startproc
/* ymm0 = table vector at offset 0; multiplier of the correction term in every butterfly. */
vmovdqa (%rsi), %ymm0
/*
 * Round 1, pass 1: pairs (0x00,0x100) (0x20,0x120) (0x40,0x140) (0x60,0x160).
 * One constant pair for all lanes: z_lo broadcast from 0x140(%rsi), z_hi from 0x148(%rsi).
 */
vpbroadcastq 0x140(%rsi), %ymm15
vmovdqa 0x100(%rdi), %ymm8
vmovdqa 0x120(%rdi), %ymm9
vmovdqa 0x140(%rdi), %ymm10
vmovdqa 0x160(%rdi), %ymm11
vpbroadcastq 0x148(%rsi), %ymm2
/* lo16(b * z_lo); the last multiply overwrites ymm15, which is no longer needed. */
vpmullw %ymm15, %ymm8, %ymm12
vpmullw %ymm15, %ymm9, %ymm13
vpmullw %ymm15, %ymm10, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
/* hi16(b * z_hi), replacing b in place. */
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm2, %ymm11, %ymm11
vmovdqa (%rdi), %ymm4
vmovdqa 0x20(%rdi), %ymm5
vmovdqa 0x40(%rdi), %ymm6
vmovdqa 0x60(%rdi), %ymm7
/* Correction term: hi16(low product * ymm0). */
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
/* a + high product and a - high product. */
vpaddw %ymm8, %ymm4, %ymm3
vpsubw %ymm8, %ymm4, %ymm8
vpaddw %ymm9, %ymm5, %ymm4
vpsubw %ymm9, %ymm5, %ymm9
vpaddw %ymm10, %ymm6, %ymm5
vpsubw %ymm10, %ymm6, %ymm10
vpaddw %ymm11, %ymm7, %ymm6
vpsubw %ymm11, %ymm7, %ymm11
/* Apply the correction with opposite signs, giving a + t and a - t. */
vpsubw %ymm12, %ymm3, %ymm3
vpaddw %ymm12, %ymm8, %ymm8
vpsubw %ymm13, %ymm4, %ymm4
vpaddw %ymm13, %ymm9, %ymm9
vpsubw %ymm14, %ymm5, %ymm5
vpaddw %ymm14, %ymm10, %ymm10
vpsubw %ymm15, %ymm6, %ymm6
vpaddw %ymm15, %ymm11, %ymm11
vmovdqa %ymm3, (%rdi)
vmovdqa %ymm4, 0x20(%rdi)
vmovdqa %ymm5, 0x40(%rdi)
vmovdqa %ymm6, 0x60(%rdi)
vmovdqa %ymm8, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
vmovdqa %ymm10, 0x140(%rdi)
vmovdqa %ymm11, 0x160(%rdi)
/* Round 1, pass 2: pairs (0x80,0x180) (0xa0,0x1a0) (0xc0,0x1c0) (0xe0,0x1e0), same constants. */
vpbroadcastq 0x140(%rsi), %ymm15
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1a0(%rdi), %ymm9
vmovdqa 0x1c0(%rdi), %ymm10
vmovdqa 0x1e0(%rdi), %ymm11
vpbroadcastq 0x148(%rsi), %ymm2
vpmullw %ymm15, %ymm8, %ymm12
vpmullw %ymm15, %ymm9, %ymm13
vpmullw %ymm15, %ymm10, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm2, %ymm11, %ymm11
vmovdqa 0x80(%rdi), %ymm4
vmovdqa 0xa0(%rdi), %ymm5
vmovdqa 0xc0(%rdi), %ymm6
vmovdqa 0xe0(%rdi), %ymm7
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm8, %ymm4, %ymm3
vpsubw %ymm8, %ymm4, %ymm8
vpaddw %ymm9, %ymm5, %ymm4
vpsubw %ymm9, %ymm5, %ymm9
vpaddw %ymm10, %ymm6, %ymm5
vpsubw %ymm10, %ymm6, %ymm10
vpaddw %ymm11, %ymm7, %ymm6
vpsubw %ymm11, %ymm7, %ymm11
vpsubw %ymm12, %ymm3, %ymm3
vpaddw %ymm12, %ymm8, %ymm8
vpsubw %ymm13, %ymm4, %ymm4
vpaddw %ymm13, %ymm9, %ymm9
vpsubw %ymm14, %ymm5, %ymm5
vpaddw %ymm14, %ymm10, %ymm10
vpsubw %ymm15, %ymm6, %ymm6
vpaddw %ymm15, %ymm11, %ymm11
vmovdqa %ymm3, 0x80(%rdi)
vmovdqa %ymm4, 0xa0(%rdi)
vmovdqa %ymm5, 0xc0(%rdi)
vmovdqa %ymm6, 0xe0(%rdi)
vmovdqa %ymm8, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
vmovdqa %ymm10, 0x1c0(%rdi)
vmovdqa %ymm11, 0x1e0(%rdi)
/*
 * Lower half (0x000..0x0e0), round 2: pairs (0x00,0x80) (0x20,0xa0) (0x40,0xc0) (0x60,0xe0).
 * Constants are full vectors: z_lo at 0x160(%rsi), z_hi at 0x180(%rsi).
 * From here until the stores of this half, all data stays in registers.
 */
vmovdqa 0x160(%rsi), %ymm15
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm9
vmovdqa 0xc0(%rdi), %ymm10
vmovdqa 0xe0(%rdi), %ymm11
vmovdqa 0x180(%rsi), %ymm2
vpmullw %ymm15, %ymm8, %ymm12
vpmullw %ymm15, %ymm9, %ymm13
vpmullw %ymm15, %ymm10, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm2, %ymm11, %ymm11
vmovdqa (%rdi), %ymm4
vmovdqa 0x20(%rdi), %ymm5
vmovdqa 0x40(%rdi), %ymm6
vmovdqa 0x60(%rdi), %ymm7
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm8, %ymm4, %ymm3
vpsubw %ymm8, %ymm4, %ymm8
vpaddw %ymm9, %ymm5, %ymm4
vpsubw %ymm9, %ymm5, %ymm9
vpaddw %ymm10, %ymm6, %ymm5
vpsubw %ymm10, %ymm6, %ymm10
vpaddw %ymm11, %ymm7, %ymm6
vpsubw %ymm11, %ymm7, %ymm11
vpsubw %ymm12, %ymm3, %ymm3
vpaddw %ymm12, %ymm8, %ymm8
vpsubw %ymm13, %ymm4, %ymm4
vpaddw %ymm13, %ymm9, %ymm9
vpsubw %ymm14, %ymm5, %ymm5
vpaddw %ymm14, %ymm10, %ymm10
vpsubw %ymm15, %ymm6, %ymm6
vpaddw %ymm15, %ymm11, %ymm11
/*
 * Lower half, round 3: regroup 128-bit halves, then butterflies with constants 0x1a0 / 0x1c0.
 * The vectors that will be multiplied are regrouped first; the other four
 * are regrouped after the multiplies have been issued.
 */
vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
vmovdqa 0x1a0(%rsi), %ymm15
vmovdqa 0x1c0(%rsi), %ymm2
vpmullw %ymm15, %ymm7, %ymm12
vpmullw %ymm15, %ymm10, %ymm13
vpmullw %ymm15, %ymm5, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm7, %ymm7
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm2, %ymm5, %ymm5
vpmulhw %ymm2, %ymm11, %ymm11
vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm7, %ymm6, %ymm4
vpsubw %ymm7, %ymm6, %ymm7
vpaddw %ymm10, %ymm8, %ymm6
vpsubw %ymm10, %ymm8, %ymm10
vpaddw %ymm5, %ymm3, %ymm8
vpsubw %ymm5, %ymm3, %ymm5
vpaddw %ymm11, %ymm9, %ymm3
vpsubw %ymm11, %ymm9, %ymm11
vpsubw %ymm12, %ymm4, %ymm4
vpaddw %ymm12, %ymm7, %ymm7
vpsubw %ymm13, %ymm6, %ymm6
vpaddw %ymm13, %ymm10, %ymm10
vpsubw %ymm14, %ymm8, %ymm8
vpaddw %ymm14, %ymm5, %ymm5
vpsubw %ymm15, %ymm3, %ymm3
vpaddw %ymm15, %ymm11, %ymm11
/* Lower half, round 4: interleave 64-bit elements, then butterflies with constants 0x1e0 / 0x200. */
vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
vmovdqa 0x1e0(%rsi), %ymm15
vmovdqa 0x200(%rsi), %ymm2
vpmullw %ymm15, %ymm9, %ymm12
vpmullw %ymm15, %ymm5, %ymm13
vpmullw %ymm15, %ymm8, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm5, %ymm5
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm2, %ymm11, %ymm11
vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm9, %ymm3, %ymm6
vpsubw %ymm9, %ymm3, %ymm9
vpaddw %ymm5, %ymm7, %ymm3
vpsubw %ymm5, %ymm7, %ymm5
vpaddw %ymm8, %ymm4, %ymm7
vpsubw %ymm8, %ymm4, %ymm8
vpaddw %ymm11, %ymm10, %ymm4
vpsubw %ymm11, %ymm10, %ymm11
vpsubw %ymm12, %ymm6, %ymm6
vpaddw %ymm12, %ymm9, %ymm9
vpsubw %ymm13, %ymm3, %ymm3
vpaddw %ymm13, %ymm5, %ymm5
vpsubw %ymm14, %ymm7, %ymm7
vpaddw %ymm14, %ymm8, %ymm8
vpsubw %ymm15, %ymm4, %ymm4
vpaddw %ymm15, %ymm11, %ymm11
/* Lower half, round 5: interleave 32-bit elements, then butterflies with constants 0x220 / 0x240. */
vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
vpsrlq $0x20, %ymm7, %ymm7
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
vpsrlq $0x20, %ymm4, %ymm4
vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
vmovdqa 0x220(%rsi), %ymm15
vmovdqa 0x240(%rsi), %ymm2
vpmullw %ymm15, %ymm10, %ymm12
vpmullw %ymm15, %ymm8, %ymm13
vpmullw %ymm15, %ymm7, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm2, %ymm7, %ymm7
vpmulhw %ymm2, %ymm11, %ymm11
vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
vpsrlq $0x20, %ymm6, %ymm6
vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
vpsrlq $0x20, %ymm3, %ymm3
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm10, %ymm4, %ymm3
vpsubw %ymm10, %ymm4, %ymm10
vpaddw %ymm8, %ymm9, %ymm4
vpsubw %ymm8, %ymm9, %ymm8
vpaddw %ymm7, %ymm6, %ymm9
vpsubw %ymm7, %ymm6, %ymm7
vpaddw %ymm11, %ymm5, %ymm6
vpsubw %ymm11, %ymm5, %ymm11
vpsubw %ymm12, %ymm3, %ymm3
vpaddw %ymm12, %ymm10, %ymm10
vpsubw %ymm13, %ymm4, %ymm4
vpaddw %ymm13, %ymm8, %ymm8
vpsubw %ymm14, %ymm9, %ymm9
vpaddw %ymm14, %ymm7, %ymm7
vpsubw %ymm15, %ymm6, %ymm6
vpaddw %ymm15, %ymm11, %ymm11
/* Lower half, round 6: interleave 16-bit elements, then butterflies with constants 0x260 / 0x280. */
vpslld $0x10, %ymm7, %ymm5
vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
vpsrld $0x10, %ymm9, %ymm9
vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
vpslld $0x10, %ymm11, %ymm9
vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
vpsrld $0x10, %ymm6, %ymm6
vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
vmovdqa 0x260(%rsi), %ymm15
vmovdqa 0x280(%rsi), %ymm2
vpmullw %ymm15, %ymm5, %ymm12
vpmullw %ymm15, %ymm7, %ymm13
vpmullw %ymm15, %ymm9, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm5, %ymm5
vpmulhw %ymm2, %ymm7, %ymm7
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm11, %ymm11
vpslld $0x10, %ymm10, %ymm6
vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
vpsrld $0x10, %ymm3, %ymm3
vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
vpslld $0x10, %ymm8, %ymm3
vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
vpsrld $0x10, %ymm4, %ymm4
vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm5, %ymm6, %ymm4
vpsubw %ymm5, %ymm6, %ymm5
vpaddw %ymm7, %ymm10, %ymm6
vpsubw %ymm7, %ymm10, %ymm7
vpaddw %ymm9, %ymm3, %ymm10
vpsubw %ymm9, %ymm3, %ymm9
vpaddw %ymm11, %ymm8, %ymm3
vpsubw %ymm11, %ymm8, %ymm11
vpsubw %ymm12, %ymm4, %ymm4
vpaddw %ymm12, %ymm5, %ymm5
vpsubw %ymm13, %ymm6, %ymm6
vpaddw %ymm13, %ymm7, %ymm7
vpsubw %ymm14, %ymm10, %ymm10
vpaddw %ymm14, %ymm9, %ymm9
vpsubw %ymm15, %ymm3, %ymm3
vpaddw %ymm15, %ymm11, %ymm11
/*
 * Lower half, round 7: no interleave. Two constant pairs are used:
 *   pairs (ymm4,ymm10) and (ymm6,ymm3)  use z_lo = 0x2a0, z_hi = 0x2c0;
 *   pairs (ymm5,ymm9)  and (ymm7,ymm11) use z_lo = 0x2e0, z_hi = 0x300.
 */
vmovdqa 0x2a0(%rsi), %ymm14
vmovdqa 0x2e0(%rsi), %ymm15
vmovdqa 0x2c0(%rsi), %ymm8
vmovdqa 0x300(%rsi), %ymm2
vpmullw %ymm14, %ymm10, %ymm12
vpmullw %ymm14, %ymm3, %ymm13
vpmullw %ymm15, %ymm9, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm8, %ymm10, %ymm10
vpmulhw %ymm8, %ymm3, %ymm3
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm11, %ymm11
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm10, %ymm4, %ymm8
vpsubw %ymm10, %ymm4, %ymm10
vpaddw %ymm3, %ymm6, %ymm4
vpsubw %ymm3, %ymm6, %ymm3
vpaddw %ymm9, %ymm5, %ymm6
vpsubw %ymm9, %ymm5, %ymm9
vpaddw %ymm11, %ymm7, %ymm5
vpsubw %ymm11, %ymm7, %ymm11
vpsubw %ymm12, %ymm8, %ymm8
vpaddw %ymm12, %ymm10, %ymm10
vpsubw %ymm13, %ymm4, %ymm4
vpaddw %ymm13, %ymm3, %ymm3
vpsubw %ymm14, %ymm6, %ymm6
vpaddw %ymm14, %ymm9, %ymm9
vpsubw %ymm15, %ymm5, %ymm5
vpaddw %ymm15, %ymm11, %ymm11
/* Store the finished lower half; the lanes are left in the interleaved order produced above. */
vmovdqa %ymm8, (%rdi)
vmovdqa %ymm4, 0x20(%rdi)
vmovdqa %ymm10, 0x40(%rdi)
vmovdqa %ymm3, 0x60(%rdi)
vmovdqa %ymm6, 0x80(%rdi)
vmovdqa %ymm5, 0xa0(%rdi)
vmovdqa %ymm9, 0xc0(%rdi)
vmovdqa %ymm11, 0xe0(%rdi)
/*
 * Upper half (0x100..0x1e0), round 2: pairs (0x100,0x180) (0x120,0x1a0) (0x140,0x1c0) (0x160,0x1e0).
 * Constants: z_lo at 0x320(%rsi), z_hi at 0x340(%rsi). The remaining rounds
 * mirror the lower half with table offsets 0x360..0x4c0.
 */
vmovdqa 0x320(%rsi), %ymm15
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1a0(%rdi), %ymm9
vmovdqa 0x1c0(%rdi), %ymm10
vmovdqa 0x1e0(%rdi), %ymm11
vmovdqa 0x340(%rsi), %ymm2
vpmullw %ymm15, %ymm8, %ymm12
vpmullw %ymm15, %ymm9, %ymm13
vpmullw %ymm15, %ymm10, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm2, %ymm11, %ymm11
vmovdqa 0x100(%rdi), %ymm4
vmovdqa 0x120(%rdi), %ymm5
vmovdqa 0x140(%rdi), %ymm6
vmovdqa 0x160(%rdi), %ymm7
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm8, %ymm4, %ymm3
vpsubw %ymm8, %ymm4, %ymm8
vpaddw %ymm9, %ymm5, %ymm4
vpsubw %ymm9, %ymm5, %ymm9
vpaddw %ymm10, %ymm6, %ymm5
vpsubw %ymm10, %ymm6, %ymm10
vpaddw %ymm11, %ymm7, %ymm6
vpsubw %ymm11, %ymm7, %ymm11
vpsubw %ymm12, %ymm3, %ymm3
vpaddw %ymm12, %ymm8, %ymm8
vpsubw %ymm13, %ymm4, %ymm4
vpaddw %ymm13, %ymm9, %ymm9
vpsubw %ymm14, %ymm5, %ymm5
vpaddw %ymm14, %ymm10, %ymm10
vpsubw %ymm15, %ymm6, %ymm6
vpaddw %ymm15, %ymm11, %ymm11
/* Upper half, round 3: regroup 128-bit halves, then butterflies with constants 0x360 / 0x380. */
vperm2i128 $0x20, %ymm10, %ymm5, %ymm7 # ymm7 = ymm5[0,1],ymm10[0,1]
vperm2i128 $0x31, %ymm10, %ymm5, %ymm10 # ymm10 = ymm5[2,3],ymm10[2,3]
vperm2i128 $0x20, %ymm11, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm11[0,1]
vperm2i128 $0x31, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[2,3],ymm11[2,3]
vmovdqa 0x360(%rsi), %ymm15
vmovdqa 0x380(%rsi), %ymm2
vpmullw %ymm15, %ymm7, %ymm12
vpmullw %ymm15, %ymm10, %ymm13
vpmullw %ymm15, %ymm5, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm7, %ymm7
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm2, %ymm5, %ymm5
vpmulhw %ymm2, %ymm11, %ymm11
vperm2i128 $0x20, %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0,1],ymm8[0,1]
vperm2i128 $0x31, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[2,3],ymm8[2,3]
vperm2i128 $0x20, %ymm9, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm9[0,1]
vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm7, %ymm6, %ymm4
vpsubw %ymm7, %ymm6, %ymm7
vpaddw %ymm10, %ymm8, %ymm6
vpsubw %ymm10, %ymm8, %ymm10
vpaddw %ymm5, %ymm3, %ymm8
vpsubw %ymm5, %ymm3, %ymm5
vpaddw %ymm11, %ymm9, %ymm3
vpsubw %ymm11, %ymm9, %ymm11
vpsubw %ymm12, %ymm4, %ymm4
vpaddw %ymm12, %ymm7, %ymm7
vpsubw %ymm13, %ymm6, %ymm6
vpaddw %ymm13, %ymm10, %ymm10
vpsubw %ymm14, %ymm8, %ymm8
vpaddw %ymm14, %ymm5, %ymm5
vpsubw %ymm15, %ymm3, %ymm3
vpaddw %ymm15, %ymm11, %ymm11
/* Upper half, round 4: interleave 64-bit elements, then butterflies with constants 0x3a0 / 0x3c0. */
vpunpcklqdq %ymm5, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm5[0],ymm8[2],ymm5[2]
vpunpckhqdq %ymm5, %ymm8, %ymm5 # ymm5 = ymm8[1],ymm5[1],ymm8[3],ymm5[3]
vpunpcklqdq %ymm11, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm11[0],ymm3[2],ymm11[2]
vpunpckhqdq %ymm11, %ymm3, %ymm11 # ymm11 = ymm3[1],ymm11[1],ymm3[3],ymm11[3]
vmovdqa 0x3a0(%rsi), %ymm15
vmovdqa 0x3c0(%rsi), %ymm2
vpmullw %ymm15, %ymm9, %ymm12
vpmullw %ymm15, %ymm5, %ymm13
vpmullw %ymm15, %ymm8, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm5, %ymm5
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm2, %ymm11, %ymm11
vpunpcklqdq %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm7[0],ymm4[2],ymm7[2]
vpunpckhqdq %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[1],ymm7[1],ymm4[3],ymm7[3]
vpunpcklqdq %ymm10, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm10[0],ymm6[2],ymm10[2]
vpunpckhqdq %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[1],ymm10[1],ymm6[3],ymm10[3]
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm9, %ymm3, %ymm6
vpsubw %ymm9, %ymm3, %ymm9
vpaddw %ymm5, %ymm7, %ymm3
vpsubw %ymm5, %ymm7, %ymm5
vpaddw %ymm8, %ymm4, %ymm7
vpsubw %ymm8, %ymm4, %ymm8
vpaddw %ymm11, %ymm10, %ymm4
vpsubw %ymm11, %ymm10, %ymm11
vpsubw %ymm12, %ymm6, %ymm6
vpaddw %ymm12, %ymm9, %ymm9
vpsubw %ymm13, %ymm3, %ymm3
vpaddw %ymm13, %ymm5, %ymm5
vpsubw %ymm14, %ymm7, %ymm7
vpaddw %ymm14, %ymm8, %ymm8
vpsubw %ymm15, %ymm4, %ymm4
vpaddw %ymm15, %ymm11, %ymm11
/* Upper half, round 5: interleave 32-bit elements, then butterflies with constants 0x3e0 / 0x400. */
vmovsldup %ymm8, %ymm10 # ymm10 = ymm8[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm10, %ymm7, %ymm10 # ymm10 = ymm7[0],ymm10[1],ymm7[2],ymm10[3],ymm7[4],ymm10[5],ymm7[6],ymm10[7]
vpsrlq $0x20, %ymm7, %ymm7
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
vmovsldup %ymm11, %ymm7 # ymm7 = ymm11[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7]
vpsrlq $0x20, %ymm4, %ymm4
vpblendd $0xaa, %ymm11, %ymm4, %ymm11 # ymm11 = ymm4[0],ymm11[1],ymm4[2],ymm11[3],ymm4[4],ymm11[5],ymm4[6],ymm11[7]
vmovdqa 0x3e0(%rsi), %ymm15
vmovdqa 0x400(%rsi), %ymm2
vpmullw %ymm15, %ymm10, %ymm12
vpmullw %ymm15, %ymm8, %ymm13
vpmullw %ymm15, %ymm7, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm2, %ymm7, %ymm7
vpmulhw %ymm2, %ymm11, %ymm11
vmovsldup %ymm9, %ymm4 # ymm4 = ymm9[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
vpsrlq $0x20, %ymm6, %ymm6
vpblendd $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7]
vmovsldup %ymm5, %ymm6 # ymm6 = ymm5[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7]
vpsrlq $0x20, %ymm3, %ymm3
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm10, %ymm4, %ymm3
vpsubw %ymm10, %ymm4, %ymm10
vpaddw %ymm8, %ymm9, %ymm4
vpsubw %ymm8, %ymm9, %ymm8
vpaddw %ymm7, %ymm6, %ymm9
vpsubw %ymm7, %ymm6, %ymm7
vpaddw %ymm11, %ymm5, %ymm6
vpsubw %ymm11, %ymm5, %ymm11
vpsubw %ymm12, %ymm3, %ymm3
vpaddw %ymm12, %ymm10, %ymm10
vpsubw %ymm13, %ymm4, %ymm4
vpaddw %ymm13, %ymm8, %ymm8
vpsubw %ymm14, %ymm9, %ymm9
vpaddw %ymm14, %ymm7, %ymm7
vpsubw %ymm15, %ymm6, %ymm6
vpaddw %ymm15, %ymm11, %ymm11
/* Upper half, round 6: interleave 16-bit elements, then butterflies with constants 0x420 / 0x440. */
vpslld $0x10, %ymm7, %ymm5
vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
vpsrld $0x10, %ymm9, %ymm9
vpblendw $0xaa, %ymm7, %ymm9, %ymm7 # ymm7 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4],ymm7[5],ymm9[6],ymm7[7],ymm9[8],ymm7[9],ymm9[10],ymm7[11],ymm9[12],ymm7[13],ymm9[14],ymm7[15]
vpslld $0x10, %ymm11, %ymm9
vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
vpsrld $0x10, %ymm6, %ymm6
vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
vmovdqa 0x420(%rsi), %ymm15
vmovdqa 0x440(%rsi), %ymm2
vpmullw %ymm15, %ymm5, %ymm12
vpmullw %ymm15, %ymm7, %ymm13
vpmullw %ymm15, %ymm9, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm2, %ymm5, %ymm5
vpmulhw %ymm2, %ymm7, %ymm7
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm11, %ymm11
vpslld $0x10, %ymm10, %ymm6
vpblendw $0xaa, %ymm6, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm6[1],ymm3[2],ymm6[3],ymm3[4],ymm6[5],ymm3[6],ymm6[7],ymm3[8],ymm6[9],ymm3[10],ymm6[11],ymm3[12],ymm6[13],ymm3[14],ymm6[15]
vpsrld $0x10, %ymm3, %ymm3
vpblendw $0xaa, %ymm10, %ymm3, %ymm10 # ymm10 = ymm3[0],ymm10[1],ymm3[2],ymm10[3],ymm3[4],ymm10[5],ymm3[6],ymm10[7],ymm3[8],ymm10[9],ymm3[10],ymm10[11],ymm3[12],ymm10[13],ymm3[14],ymm10[15]
vpslld $0x10, %ymm8, %ymm3
vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
vpsrld $0x10, %ymm4, %ymm4
vpblendw $0xaa, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm8[1],ymm4[2],ymm8[3],ymm4[4],ymm8[5],ymm4[6],ymm8[7],ymm4[8],ymm8[9],ymm4[10],ymm8[11],ymm4[12],ymm8[13],ymm4[14],ymm8[15]
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm5, %ymm6, %ymm4
vpsubw %ymm5, %ymm6, %ymm5
vpaddw %ymm7, %ymm10, %ymm6
vpsubw %ymm7, %ymm10, %ymm7
vpaddw %ymm9, %ymm3, %ymm10
vpsubw %ymm9, %ymm3, %ymm9
vpaddw %ymm11, %ymm8, %ymm3
vpsubw %ymm11, %ymm8, %ymm11
vpsubw %ymm12, %ymm4, %ymm4
vpaddw %ymm12, %ymm5, %ymm5
vpsubw %ymm13, %ymm6, %ymm6
vpaddw %ymm13, %ymm7, %ymm7
vpsubw %ymm14, %ymm10, %ymm10
vpaddw %ymm14, %ymm9, %ymm9
vpsubw %ymm15, %ymm3, %ymm3
vpaddw %ymm15, %ymm11, %ymm11
/*
 * Upper half, round 7: no interleave. Two constant pairs are used:
 *   pairs (ymm4,ymm10) and (ymm6,ymm3)  use z_lo = 0x460, z_hi = 0x480;
 *   pairs (ymm5,ymm9)  and (ymm7,ymm11) use z_lo = 0x4a0, z_hi = 0x4c0.
 */
vmovdqa 0x460(%rsi), %ymm14
vmovdqa 0x4a0(%rsi), %ymm15
vmovdqa 0x480(%rsi), %ymm8
vmovdqa 0x4c0(%rsi), %ymm2
vpmullw %ymm14, %ymm10, %ymm12
vpmullw %ymm14, %ymm3, %ymm13
vpmullw %ymm15, %ymm9, %ymm14
vpmullw %ymm15, %ymm11, %ymm15
vpmulhw %ymm8, %ymm10, %ymm10
vpmulhw %ymm8, %ymm3, %ymm3
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm2, %ymm11, %ymm11
vpmulhw %ymm0, %ymm12, %ymm12
vpmulhw %ymm0, %ymm13, %ymm13
vpmulhw %ymm0, %ymm14, %ymm14
vpmulhw %ymm0, %ymm15, %ymm15
vpaddw %ymm10, %ymm4, %ymm8
vpsubw %ymm10, %ymm4, %ymm10
vpaddw %ymm3, %ymm6, %ymm4
vpsubw %ymm3, %ymm6, %ymm3
vpaddw %ymm9, %ymm5, %ymm6
vpsubw %ymm9, %ymm5, %ymm9
vpaddw %ymm11, %ymm7, %ymm5
vpsubw %ymm11, %ymm7, %ymm11
vpsubw %ymm12, %ymm8, %ymm8
vpaddw %ymm12, %ymm10, %ymm10
vpsubw %ymm13, %ymm4, %ymm4
vpaddw %ymm13, %ymm3, %ymm3
vpsubw %ymm14, %ymm6, %ymm6
vpaddw %ymm14, %ymm9, %ymm9
vpsubw %ymm15, %ymm5, %ymm5
vpaddw %ymm15, %ymm11, %ymm11
/* Store the finished upper half, in the same register-to-offset order as the lower half. */
vmovdqa %ymm8, 0x100(%rdi)
vmovdqa %ymm4, 0x120(%rdi)
vmovdqa %ymm10, 0x140(%rdi)
vmovdqa %ymm3, 0x160(%rdi)
vmovdqa %ymm6, 0x180(%rdi)
vmovdqa %ymm5, 0x1a0(%rdi)
vmovdqa %ymm9, 0x1c0(%rdi)
vmovdqa %ymm11, 0x1e0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,120 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/nttfrombytes.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
/*
 * mlkem_nttfrombytes_avx2: entry point.
 *
 * Register usage, as visible in this routine and its core helper:
 *   rdi - destination; written with aligned 32-byte stores (vmovdqa), so it
 *         must be 32-byte aligned. 2 x 256 = 512 bytes are written.
 *   rsi - source; read with unaligned 32-byte loads (vmovdqu).
 *         2 x 192 = 384 bytes are read.
 *   rdx - base of a constant table; the 32 bytes at offset 0xe0 are loaded
 *         (aligned) into ymm0 and used by the core as an AND mask.
 *         NOTE(review): presumably 0x0FFF in every 16-bit lane - confirm
 *         against the table definition, which is not visible here.
 *
 * The work is done in two halves by calling the core helper twice, advancing
 * the destination by 0x100 bytes and the source by 0xc0 bytes in between.
 * rdi and rsi are modified; ymm0 stays live across both calls.
 */
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_nttfrombytes_avx2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_nttfrombytes_avx2)
S2N_BN_SYMBOL(mlkem_nttfrombytes_avx2):
.cfi_startproc
/* Load the AND mask used by the core for every extracted field. */
vmovdqa 0xe0(%rdx), %ymm0
/* First half: 192 input bytes -> 256 output bytes. */
callq Lnttfrombytes_avx2_core
/* Second half: step both pointers past the data handled above. */
addq $0x100, %rdi # imm = 0x100
addq $0xc0, %rsi
callq Lnttfrombytes_avx2_core
retq
.cfi_endproc
/*
 * Lnttfrombytes_avx2_core: local helper, called twice by the entry point.
 *
 * Reads six 32-byte vectors (192 bytes) from rsi, rearranges them with a
 * sequence of 128-, 64-, 32- and 16-bit interleaves, then splits each group
 * of three 16-bit words into four fields using shifts by 4/8/12 bits and an
 * AND with ymm0. Eight 32-byte vectors (256 bytes) are stored to rdi.
 *
 * Inputs : rsi (source, unaligned ok), rdi (destination, 32-byte aligned),
 *          ymm0 (mask, set up by the caller).
 * Writes : ymm1, ymm3-ymm15. rdi, rsi and ymm0 are left unchanged.
 */
Lnttfrombytes_avx2_core:
.cfi_startproc
/* Load 6 x 32 input bytes. */
vmovdqu (%rsi), %ymm4
vmovdqu 0x20(%rsi), %ymm5
vmovdqu 0x40(%rsi), %ymm6
vmovdqu 0x60(%rsi), %ymm7
vmovdqu 0x80(%rsi), %ymm8
vmovdqu 0xa0(%rsi), %ymm9
/* Stage 1: pair up 128-bit halves (0x20 = both low halves, 0x31 = both high halves). */
vperm2i128 $0x20, %ymm7, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm7[0,1]
vperm2i128 $0x31, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[2,3],ymm7[2,3]
vperm2i128 $0x20, %ymm8, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm8[0,1]
vperm2i128 $0x31, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[2,3],ymm8[2,3]
vperm2i128 $0x20, %ymm9, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm9[0,1]
vperm2i128 $0x31, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[2,3],ymm9[2,3]
/* Stage 2: interleave 64-bit quadwords. */
vpunpcklqdq %ymm8, %ymm3, %ymm6 # ymm6 = ymm3[0],ymm8[0],ymm3[2],ymm8[2]
vpunpckhqdq %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[1],ymm8[1],ymm3[3],ymm8[3]
vpunpcklqdq %ymm5, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm5[0],ymm7[2],ymm5[2]
vpunpckhqdq %ymm5, %ymm7, %ymm5 # ymm5 = ymm7[1],ymm5[1],ymm7[3],ymm5[3]
vpunpcklqdq %ymm9, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm9[0],ymm4[2],ymm9[2]
vpunpckhqdq %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[1],ymm9[1],ymm4[3],ymm9[3]
/* Stage 3: interleave 32-bit dwords (duplicate-even + blend, shift-down + blend). */
vmovsldup %ymm5, %ymm4 # ymm4 = ymm5[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7]
vpsrlq $0x20, %ymm6, %ymm6
vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
vmovsldup %ymm7, %ymm6 # ymm6 = ymm7[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7]
vpsrlq $0x20, %ymm8, %ymm8
vpblendd $0xaa, %ymm7, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4],ymm7[5],ymm8[6],ymm7[7]
vmovsldup %ymm9, %ymm8 # ymm8 = ymm9[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
vpsrlq $0x20, %ymm3, %ymm3
vpblendd $0xaa, %ymm9, %ymm3, %ymm9 # ymm9 = ymm3[0],ymm9[1],ymm3[2],ymm9[3],ymm3[4],ymm9[5],ymm3[6],ymm9[7]
/* Stage 4: interleave 16-bit words (shift-up + blend, shift-down + blend). */
vpslld $0x10, %ymm7, %ymm10
vpblendw $0xaa, %ymm10, %ymm4, %ymm10 # ymm10 = ymm4[0],ymm10[1],ymm4[2],ymm10[3],ymm4[4],ymm10[5],ymm4[6],ymm10[7],ymm4[8],ymm10[9],ymm4[10],ymm10[11],ymm4[12],ymm10[13],ymm4[14],ymm10[15]
vpsrld $0x10, %ymm4, %ymm4
vpblendw $0xaa, %ymm7, %ymm4, %ymm7 # ymm7 = ymm4[0],ymm7[1],ymm4[2],ymm7[3],ymm4[4],ymm7[5],ymm4[6],ymm7[7],ymm4[8],ymm7[9],ymm4[10],ymm7[11],ymm4[12],ymm7[13],ymm4[14],ymm7[15]
vpslld $0x10, %ymm8, %ymm4
vpblendw $0xaa, %ymm4, %ymm5, %ymm4 # ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4],ymm4[5],ymm5[6],ymm4[7],ymm5[8],ymm4[9],ymm5[10],ymm4[11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
vpsrld $0x10, %ymm5, %ymm5
vpblendw $0xaa, %ymm8, %ymm5, %ymm8 # ymm8 = ymm5[0],ymm8[1],ymm5[2],ymm8[3],ymm5[4],ymm8[5],ymm5[6],ymm8[7],ymm5[8],ymm8[9],ymm5[10],ymm8[11],ymm5[12],ymm8[13],ymm5[14],ymm8[15]
vpslld $0x10, %ymm9, %ymm5
vpblendw $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7],ymm6[8],ymm5[9],ymm6[10],ymm5[11],ymm6[12],ymm5[13],ymm6[14],ymm5[15]
vpsrld $0x10, %ymm6, %ymm6
vpblendw $0xaa, %ymm9, %ymm6, %ymm9 # ymm9 = ymm6[0],ymm9[1],ymm6[2],ymm9[3],ymm6[4],ymm9[5],ymm6[6],ymm9[7],ymm6[8],ymm9[9],ymm6[10],ymm9[11],ymm6[12],ymm9[13],ymm6[14],ymm9[15]
/*
 * Field extraction, first word triple (ymm10, ymm7, ymm4), per 16-bit lane:
 *   ymm10 = ymm10 & mask
 *   ymm11 = ((ymm10 >> 12) | (ymm7 << 4)) & mask
 *   ymm12 = ((ymm7 >> 8) | (ymm4 << 8)) & mask
 *   ymm13 = (ymm4 >> 4) & mask
 * The shifted copy of ymm10 is taken before ymm10 itself is masked.
 */
vpsrlw $0xc, %ymm10, %ymm11
vpsllw $0x4, %ymm7, %ymm12
vpor %ymm11, %ymm12, %ymm11
vpand %ymm0, %ymm10, %ymm10
vpand %ymm0, %ymm11, %ymm11
vpsrlw $0x8, %ymm7, %ymm12
vpsllw $0x8, %ymm4, %ymm13
vpor %ymm12, %ymm13, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpsrlw $0x4, %ymm4, %ymm13
vpand %ymm0, %ymm13, %ymm13
/*
 * Field extraction, second word triple (ymm8, ymm5, ymm9), same pattern:
 *   ymm8  = ymm8 & mask
 *   ymm14 = ((ymm8 >> 12) | (ymm5 << 4)) & mask
 *   ymm15 = ((ymm5 >> 8) | (ymm9 << 8)) & mask
 *   ymm1  = (ymm9 >> 4) & mask
 */
vpsrlw $0xc, %ymm8, %ymm14
vpsllw $0x4, %ymm5, %ymm15
vpor %ymm14, %ymm15, %ymm14
vpand %ymm0, %ymm8, %ymm8
vpand %ymm0, %ymm14, %ymm14
vpsrlw $0x8, %ymm5, %ymm15
vpsllw $0x8, %ymm9, %ymm1
vpor %ymm15, %ymm1, %ymm15
vpand %ymm0, %ymm15, %ymm15
vpsrlw $0x4, %ymm9, %ymm1
vpand %ymm0, %ymm1, %ymm1
/* Store 8 x 32 output bytes (aligned). */
vmovdqa %ymm10, (%rdi)
vmovdqa %ymm11, 0x20(%rdi)
vmovdqa %ymm12, 0x40(%rdi)
vmovdqa %ymm13, 0x60(%rdi)
vmovdqa %ymm8, 0x80(%rdi)
vmovdqa %ymm14, 0xa0(%rdi)
vmovdqa %ymm15, 0xc0(%rdi)
vmovdqa %ymm1, 0xe0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,114 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/ntttobytes.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
/*
 * mlkem_ntttobytes_avx2: entry point.
 *
 * Register usage, as visible in this routine and its core helper:
 *   rdi - destination; written with unaligned 32-byte stores (vmovdqu).
 *         2 x 192 = 384 bytes are written.
 *   rsi - source; read with aligned 32-byte loads (vmovdqa), so it must be
 *         32-byte aligned. 2 x 256 = 512 bytes are read.
 *   rdx - base of a constant table; its first 32 bytes are loaded (aligned)
 *         into ymm0.
 *
 * NOTE(review): the core helper in this file never reads ymm0, so the load
 * below has no effect on the result; it only requires rdx to point at 32
 * readable, 32-byte aligned bytes. The file is auto-derived, so the load is
 * kept exactly as generated.
 *
 * The work is done in two halves by calling the core helper twice, advancing
 * the source by 0x100 bytes and the destination by 0xc0 bytes in between.
 * rdi and rsi are modified.
 */
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_ntttobytes_avx2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_ntttobytes_avx2)
S2N_BN_SYMBOL(mlkem_ntttobytes_avx2):
.cfi_startproc
vmovdqa (%rdx), %ymm0
/* First half: 256 input bytes -> 192 output bytes. */
callq Lntttobytes_avx2_core
/* Second half: step both pointers past the data handled above. */
addq $0x100, %rsi # imm = 0x100
addq $0xc0, %rdi
callq Lntttobytes_avx2_core
retq
.cfi_endproc
/*
 * Lntttobytes_avx2_core: local helper, called twice by the entry point.
 *
 * Reads eight 32-byte vectors (256 bytes) from rsi, merges each group of four
 * 16-bit lanes into three 16-bit words using shifts by 4/8/12 bits and ORs,
 * then rearranges the six resulting vectors with 16-, 32-, 64- and 128-bit
 * interleaves and stores 192 bytes to rdi.
 *
 * No masking is applied before the ORs, so bits above bit 11 of an input
 * lane would spill into the neighbouring field.
 * NOTE(review): presumably callers guarantee every input lane is in
 * [0, 2^12) - confirm against the C-level contract.
 *
 * Inputs : rsi (source, 32-byte aligned), rdi (destination, unaligned ok).
 * Writes : ymm3-ymm12. rdi and rsi are left unchanged.
 */
Lntttobytes_avx2_core:
.cfi_startproc
/* Load 8 x 32 input bytes (aligned); call them c0..c7 in load order. */
vmovdqa (%rsi), %ymm5
vmovdqa 0x20(%rsi), %ymm6
vmovdqa 0x40(%rsi), %ymm7
vmovdqa 0x60(%rsi), %ymm8
vmovdqa 0x80(%rsi), %ymm9
vmovdqa 0xa0(%rsi), %ymm10
vmovdqa 0xc0(%rsi), %ymm11
vmovdqa 0xe0(%rsi), %ymm12
/*
 * Pack c0..c3 into three words per 16-bit lane:
 *   ymm4 = c0 | (c1 << 12)
 *   ymm5 = (c1 >> 4) | (c2 << 8)
 *   ymm6 = (c2 >> 8) | (c3 << 4)
 */
vpsllw $0xc, %ymm6, %ymm4
vpor %ymm4, %ymm5, %ymm4
vpsrlw $0x4, %ymm6, %ymm5
vpsllw $0x8, %ymm7, %ymm6
vpor %ymm5, %ymm6, %ymm5
vpsrlw $0x8, %ymm7, %ymm6
vpsllw $0x4, %ymm8, %ymm7
vpor %ymm6, %ymm7, %ymm6
/*
 * Pack c4..c7 the same way:
 *   ymm7 = c4 | (c5 << 12)
 *   ymm8 = (c5 >> 4) | (c6 << 8)
 *   ymm9 = (c6 >> 8) | (c7 << 4)
 */
vpsllw $0xc, %ymm10, %ymm7
vpor %ymm7, %ymm9, %ymm7
vpsrlw $0x4, %ymm10, %ymm8
vpsllw $0x8, %ymm11, %ymm9
vpor %ymm8, %ymm9, %ymm8
vpsrlw $0x8, %ymm11, %ymm9
vpsllw $0x4, %ymm12, %ymm10
vpor %ymm9, %ymm10, %ymm9
/* Stage 1: interleave 16-bit words (shift-up + blend, shift-down + blend). */
vpslld $0x10, %ymm5, %ymm3
vpblendw $0xaa, %ymm3, %ymm4, %ymm3 # ymm3 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4],ymm3[5],ymm4[6],ymm3[7],ymm4[8],ymm3[9],ymm4[10],ymm3[11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
vpsrld $0x10, %ymm4, %ymm4
vpblendw $0xaa, %ymm5, %ymm4, %ymm5 # ymm5 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4],ymm5[5],ymm4[6],ymm5[7],ymm4[8],ymm5[9],ymm4[10],ymm5[11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
vpslld $0x10, %ymm7, %ymm4
vpblendw $0xaa, %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4],ymm4[5],ymm6[6],ymm4[7],ymm6[8],ymm4[9],ymm6[10],ymm4[11],ymm6[12],ymm4[13],ymm6[14],ymm4[15]
vpsrld $0x10, %ymm6, %ymm6
vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
vpslld $0x10, %ymm9, %ymm6
vpblendw $0xaa, %ymm6, %ymm8, %ymm6 # ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7],ymm8[8],ymm6[9],ymm8[10],ymm6[11],ymm8[12],ymm6[13],ymm8[14],ymm6[15]
vpsrld $0x10, %ymm8, %ymm8
vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
/* Stage 2: interleave 32-bit dwords (duplicate-even + blend, shift-down + blend). */
vmovsldup %ymm4, %ymm8 # ymm8 = ymm4[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm8, %ymm3, %ymm8 # ymm8 = ymm3[0],ymm8[1],ymm3[2],ymm8[3],ymm3[4],ymm8[5],ymm3[6],ymm8[7]
vpsrlq $0x20, %ymm3, %ymm3
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
vmovsldup %ymm5, %ymm3 # ymm3 = ymm5[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4],ymm3[5],ymm6[6],ymm3[7]
vpsrlq $0x20, %ymm6, %ymm6
vpblendd $0xaa, %ymm5, %ymm6, %ymm5 # ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4],ymm5[5],ymm6[6],ymm5[7]
vmovsldup %ymm9, %ymm6 # ymm6 = ymm9[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm6, %ymm7, %ymm6 # ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7]
vpsrlq $0x20, %ymm7, %ymm7
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
/* Stage 3: interleave 64-bit quadwords. */
vpunpcklqdq %ymm3, %ymm8, %ymm7 # ymm7 = ymm8[0],ymm3[0],ymm8[2],ymm3[2]
vpunpckhqdq %ymm3, %ymm8, %ymm3 # ymm3 = ymm8[1],ymm3[1],ymm8[3],ymm3[3]
vpunpcklqdq %ymm4, %ymm6, %ymm8 # ymm8 = ymm6[0],ymm4[0],ymm6[2],ymm4[2]
vpunpckhqdq %ymm4, %ymm6, %ymm4 # ymm4 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
vpunpcklqdq %ymm9, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm9[0],ymm5[2],ymm9[2]
vpunpckhqdq %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[1],ymm9[1],ymm5[3],ymm9[3]
/* Stage 4: pair up 128-bit halves (0x20 = both low halves, 0x31 = both high halves). */
vperm2i128 $0x20, %ymm8, %ymm7, %ymm5 # ymm5 = ymm7[0,1],ymm8[0,1]
vperm2i128 $0x31, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[2,3],ymm8[2,3]
vperm2i128 $0x20, %ymm3, %ymm6, %ymm7 # ymm7 = ymm6[0,1],ymm3[0,1]
vperm2i128 $0x31, %ymm3, %ymm6, %ymm3 # ymm3 = ymm6[2,3],ymm3[2,3]
vperm2i128 $0x20, %ymm9, %ymm4, %ymm6 # ymm6 = ymm4[0,1],ymm9[0,1]
vperm2i128 $0x31, %ymm9, %ymm4, %ymm9 # ymm9 = ymm4[2,3],ymm9[2,3]
/* Store 6 x 32 output bytes (unaligned stores). */
vmovdqu %ymm5, (%rdi)
vmovdqu %ymm7, 0x20(%rdi)
vmovdqu %ymm6, 0x40(%rdi)
vmovdqu %ymm8, 0x60(%rdi)
vmovdqu %ymm3, 0x80(%rdi)
vmovdqu %ymm9, 0xa0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,110 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/nttunpack.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
/*
 * mlkem_nttunpack_avx2: entry point.
 *
 * Permutes a 512-byte buffer in place, in two independent 256-byte halves.
 *   rdi - buffer; accessed with aligned 32-byte loads and stores (vmovdqa)
 *         by the core helper, so it must be 32-byte aligned.
 *
 * rdi is advanced by 0x100 between the two calls and is not restored.
 */
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_nttunpack_avx2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_nttunpack_avx2)
S2N_BN_SYMBOL(mlkem_nttunpack_avx2):
.cfi_startproc
/* First 256-byte half. */
callq Lnttunpack_avx2_core
/* Second 256-byte half. */
addq $0x100, %rdi # imm = 0x100
callq Lnttunpack_avx2_core
retq
.cfi_endproc
/*
 * Lnttunpack_avx2_core: local helper, called twice by the entry point.
 *
 * Loads eight 32-byte vectors (256 bytes) from rdi, applies four interleave
 * stages of decreasing granularity (128-bit, 64-bit, 32-bit, 16-bit), and
 * stores the eight resulting vectors back to the same 256 bytes. No
 * arithmetic is performed; the routine only reorders 16-bit words.
 *
 * Inputs : rdi (buffer, 32-byte aligned).
 * Writes : ymm3-ymm11. rdi is left unchanged.
 */
Lnttunpack_avx2_core:
.cfi_startproc
/* Load the 256-byte block. */
vmovdqa (%rdi), %ymm4
vmovdqa 0x20(%rdi), %ymm5
vmovdqa 0x40(%rdi), %ymm6
vmovdqa 0x60(%rdi), %ymm7
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm9
vmovdqa 0xc0(%rdi), %ymm10
vmovdqa 0xe0(%rdi), %ymm11
/* Stage 1: pair vector i with vector i+4 at 128-bit granularity. */
vperm2i128 $0x20, %ymm8, %ymm4, %ymm3 # ymm3 = ymm4[0,1],ymm8[0,1]
vperm2i128 $0x31, %ymm8, %ymm4, %ymm8 # ymm8 = ymm4[2,3],ymm8[2,3]
vperm2i128 $0x20, %ymm9, %ymm5, %ymm4 # ymm4 = ymm5[0,1],ymm9[0,1]
vperm2i128 $0x31, %ymm9, %ymm5, %ymm9 # ymm9 = ymm5[2,3],ymm9[2,3]
vperm2i128 $0x20, %ymm10, %ymm6, %ymm5 # ymm5 = ymm6[0,1],ymm10[0,1]
vperm2i128 $0x31, %ymm10, %ymm6, %ymm10 # ymm10 = ymm6[2,3],ymm10[2,3]
vperm2i128 $0x20, %ymm11, %ymm7, %ymm6 # ymm6 = ymm7[0,1],ymm11[0,1]
vperm2i128 $0x31, %ymm11, %ymm7, %ymm11 # ymm11 = ymm7[2,3],ymm11[2,3]
/* Stage 2: interleave 64-bit quadwords. */
vpunpcklqdq %ymm5, %ymm3, %ymm7 # ymm7 = ymm3[0],ymm5[0],ymm3[2],ymm5[2]
vpunpckhqdq %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
vpunpcklqdq %ymm10, %ymm8, %ymm3 # ymm3 = ymm8[0],ymm10[0],ymm8[2],ymm10[2]
vpunpckhqdq %ymm10, %ymm8, %ymm10 # ymm10 = ymm8[1],ymm10[1],ymm8[3],ymm10[3]
vpunpcklqdq %ymm6, %ymm4, %ymm8 # ymm8 = ymm4[0],ymm6[0],ymm4[2],ymm6[2]
vpunpckhqdq %ymm6, %ymm4, %ymm6 # ymm6 = ymm4[1],ymm6[1],ymm4[3],ymm6[3]
vpunpcklqdq %ymm11, %ymm9, %ymm4 # ymm4 = ymm9[0],ymm11[0],ymm9[2],ymm11[2]
vpunpckhqdq %ymm11, %ymm9, %ymm11 # ymm11 = ymm9[1],ymm11[1],ymm9[3],ymm11[3]
/* Stage 3: interleave 32-bit dwords (duplicate-even + blend, shift-down + blend). */
vmovsldup %ymm8, %ymm9 # ymm9 = ymm8[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm9, %ymm7, %ymm9 # ymm9 = ymm7[0],ymm9[1],ymm7[2],ymm9[3],ymm7[4],ymm9[5],ymm7[6],ymm9[7]
vpsrlq $0x20, %ymm7, %ymm7
vpblendd $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7]
vmovsldup %ymm6, %ymm7 # ymm7 = ymm6[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm7, %ymm5, %ymm7 # ymm7 = ymm5[0],ymm7[1],ymm5[2],ymm7[3],ymm5[4],ymm7[5],ymm5[6],ymm7[7]
vpsrlq $0x20, %ymm5, %ymm5
vpblendd $0xaa, %ymm6, %ymm5, %ymm6 # ymm6 = ymm5[0],ymm6[1],ymm5[2],ymm6[3],ymm5[4],ymm6[5],ymm5[6],ymm6[7]
vmovsldup %ymm4, %ymm5 # ymm5 = ymm4[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm5, %ymm3, %ymm5 # ymm5 = ymm3[0],ymm5[1],ymm3[2],ymm5[3],ymm3[4],ymm5[5],ymm3[6],ymm5[7]
vpsrlq $0x20, %ymm3, %ymm3
vpblendd $0xaa, %ymm4, %ymm3, %ymm4 # ymm4 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4],ymm4[5],ymm3[6],ymm4[7]
vmovsldup %ymm11, %ymm3 # ymm3 = ymm11[0,0,2,2,4,4,6,6]
vpblendd $0xaa, %ymm3, %ymm10, %ymm3 # ymm3 = ymm10[0],ymm3[1],ymm10[2],ymm3[3],ymm10[4],ymm3[5],ymm10[6],ymm3[7]
vpsrlq $0x20, %ymm10, %ymm10
vpblendd $0xaa, %ymm11, %ymm10, %ymm11 # ymm11 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4],ymm11[5],ymm10[6],ymm11[7]
/* Stage 4: interleave 16-bit words (shift-up + blend, shift-down + blend). */
vpslld $0x10, %ymm5, %ymm10
vpblendw $0xaa, %ymm10, %ymm9, %ymm10 # ymm10 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4],ymm10[5],ymm9[6],ymm10[7],ymm9[8],ymm10[9],ymm9[10],ymm10[11],ymm9[12],ymm10[13],ymm9[14],ymm10[15]
vpsrld $0x10, %ymm9, %ymm9
vpblendw $0xaa, %ymm5, %ymm9, %ymm5 # ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7],ymm9[8],ymm5[9],ymm9[10],ymm5[11],ymm9[12],ymm5[13],ymm9[14],ymm5[15]
vpslld $0x10, %ymm4, %ymm9
vpblendw $0xaa, %ymm9, %ymm8, %ymm9 # ymm9 = ymm8[0],ymm9[1],ymm8[2],ymm9[3],ymm8[4],ymm9[5],ymm8[6],ymm9[7],ymm8[8],ymm9[9],ymm8[10],ymm9[11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
vpsrld $0x10, %ymm8, %ymm8
vpblendw $0xaa, %ymm4, %ymm8, %ymm4 # ymm4 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4],ymm4[5],ymm8[6],ymm4[7],ymm8[8],ymm4[9],ymm8[10],ymm4[11],ymm8[12],ymm4[13],ymm8[14],ymm4[15]
vpslld $0x10, %ymm3, %ymm8
vpblendw $0xaa, %ymm8, %ymm7, %ymm8 # ymm8 = ymm7[0],ymm8[1],ymm7[2],ymm8[3],ymm7[4],ymm8[5],ymm7[6],ymm8[7],ymm7[8],ymm8[9],ymm7[10],ymm8[11],ymm7[12],ymm8[13],ymm7[14],ymm8[15]
vpsrld $0x10, %ymm7, %ymm7
vpblendw $0xaa, %ymm3, %ymm7, %ymm3 # ymm3 = ymm7[0],ymm3[1],ymm7[2],ymm3[3],ymm7[4],ymm3[5],ymm7[6],ymm3[7],ymm7[8],ymm3[9],ymm7[10],ymm3[11],ymm7[12],ymm3[13],ymm7[14],ymm3[15]
vpslld $0x10, %ymm11, %ymm7
vpblendw $0xaa, %ymm7, %ymm6, %ymm7 # ymm7 = ymm6[0],ymm7[1],ymm6[2],ymm7[3],ymm6[4],ymm7[5],ymm6[6],ymm7[7],ymm6[8],ymm7[9],ymm6[10],ymm7[11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
vpsrld $0x10, %ymm6, %ymm6
vpblendw $0xaa, %ymm11, %ymm6, %ymm11 # ymm11 = ymm6[0],ymm11[1],ymm6[2],ymm11[3],ymm6[4],ymm11[5],ymm6[6],ymm11[7],ymm6[8],ymm11[9],ymm6[10],ymm11[11],ymm6[12],ymm11[13],ymm6[14],ymm11[15]
/* Write the permuted block back over the input. */
vmovdqa %ymm10, (%rdi)
vmovdqa %ymm5, 0x20(%rdi)
vmovdqa %ymm9, 0x40(%rdi)
vmovdqa %ymm4, 0x60(%rdi)
vmovdqa %ymm8, 0x80(%rdi)
vmovdqa %ymm3, 0xa0(%rdi)
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm11, 0xe0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,158 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2], but it was significantly modified:
* 1) we are using mulcache (following the aarch64 implementation),
* 2) schoolbook macro was simplified to process one instead of two inputs,
* without any performance degradation. This also made it possible to
* avoid using the stack because we managed to fit the implementation
* into 16 available ymm registers,
* 3) we use named variables instead of ymm registers directly.
*/
/*
 * Symbolic names for the ymm registers used by the schoolbook macro.
 * These are C preprocessor aliases, so they are substituted textually
 * before the assembler sees the .macro bodies.
 */
/* Constants, loaded once by polyvec_basemul. */
#define _q %ymm0
#define _qinv %ymm1
/* Inputs for one schoolbook step: (a + bX), (c + dX) and cached d * zeta. */
#define _a %ymm2
#define _b %ymm3
#define _c %ymm4
#define _d %ymm5
#define _dz %ymm6
/* Partial results: r = r0 + r1 and s = s0 + s1. */
#define _r0 %ymm7
#define _r1 %ymm8
#define _s0 %ymm9
#define _s1 %ymm10
/* High halves of the four 16x16-bit products. */
#define _ac_hi %ymm11
#define _ad_hi %ymm12
#define _bdz_hi %ymm13
#define _bc_hi %ymm14
/*
 * _a_lo and _b_lo deliberately share ymm13/ymm14 with _bdz_hi/_bc_hi.
 * In schoolbook, the last reads of _a_lo/_b_lo (the vpmullw group) come
 * before the first writes of _bdz_hi/_bc_hi (the vpmulhw group), so the
 * two uses never overlap.
 */
#define _a_lo %ymm13
#define _b_lo %ymm14
/* Polynomials to be multiplied are denoted (a + bX) (rsi arg) and
* (c + dX) (rdx arg). The mulcache for (c + dX), that is, the precomputed
* values of (d * zeta) is passed in rcx. We compute:
* (r + sX) = (a + bX) * (c + dX), where in the code we denote
* r = ac + bdz = r0 + r1,
* s = ad + bc = s0 + s1.
*/
/*
 * schoolbook iter k
 *
 * Emits one unrolled step that multiplies 16 pairs (a + bX) * (c + dX) and
 * writes (or accumulates) 16 pairs (r + sX).
 *
 *   iter - block index within a polynomial (poly_basemul uses 0..7). Each
 *          block covers 32 consecutive int16 words: 16 words of a followed
 *          by 16 words of b (and likewise c/d and r/s).
 *   k    - polynomial index within the vector. Inputs are read at word
 *          offset 256*k + 32*iter from rsi/rdx and the mulcache at word
 *          offset 128*k + 16*iter from rcx. For k > 0 the result is added
 *          to what is already stored at rdi; for k == 0 it overwrites it.
 *
 * Expects _q and _qinv to be loaded already. All accesses are aligned
 * 32-byte vmovdqa, so rsi, rdx, rcx and rdi must be 32-byte aligned.
 */
.macro schoolbook iter k
vmovdqa (256*\k + 32*\iter + 0)*2(%rsi), _a
vmovdqa (256*\k + 32*\iter + 16)*2(%rsi), _b
vmovdqa (256*\k + 32*\iter + 0)*2(%rdx), _c
vmovdqa (256*\k + 32*\iter + 16)*2(%rdx), _d
vmovdqa (128*\k + 16*\iter)*2(%rcx), _dz
/* Prepare Montgomery twists */
vpmullw _a, _qinv, _a_lo
vpmullw _b, _qinv, _b_lo
/* Compute low-parts of monomials in (a + bX) * (c + dX),
* using Montgomery twists calculated before. */
vpmullw _a_lo, _c, _r0
vpmullw _a_lo, _d, _s0
vpmullw _b_lo, _dz, _r1
vpmullw _b_lo, _c, _s1
/* Compute the second high multiplication in Montgomery multiplication. */
vpmulhw _r0, _q, _r0
vpmulhw _s0, _q, _s0
vpmulhw _r1, _q, _r1
vpmulhw _s1, _q, _s1
/* Compute high-parts of monomials in (a + bX) * (c + dX).
* From here on ymm13/ymm14 hold _bdz_hi/_bc_hi; _a_lo/_b_lo are dead. */
vpmulhw _a, _c, _ac_hi
vpmulhw _a, _d, _ad_hi
vpmulhw _b, _dz, _bdz_hi
vpmulhw _b, _c, _bc_hi
/* Finish Montgomery multiplications */
vpsubw _r0, _ac_hi, _r0
vpsubw _s0, _ad_hi, _s0
.if \iter & 1 /* Every other (d * zeta) is stored negative */
/* Odd block: operands swapped, giving r1 - bdz_hi instead of bdz_hi - r1. */
vpsubw _bdz_hi, _r1, _r1
.else
vpsubw _r1, _bdz_hi, _r1
.endif
vpsubw _s1, _bc_hi, _s1
/* Combine the partial results: r = r0 + r1, s = s0 + s1. */
vpaddw _r0, _r1, _r0
vpaddw _s0, _s1, _s0
.if \k > 0
/* Not the first polynomial: add to the running sum already in the output. */
vmovdqa (32*\iter + 0)*2(%rdi), _r1
vmovdqa (32*\iter + 16)*2(%rdi), _s1
vpaddw _r0, _r1, _r0
vpaddw _s0, _s1, _s0
.endif
vmovdqa _r0, (32*\iter + 0)*2(%rdi)
vmovdqa _s0, (32*\iter + 16)*2(%rdi)
/* Bounds. The only assumptions we make on the input are:
* abs(a, b) < 2^12 and abs(c, d, dz) <= 2^15.
* Therefore we have that the products a*c, b*dz, a*d, b*c are bounded by 3713.
* For example,
* a*c <= ceil(abs(a) * abs(c) / 2^16) + (Q + 1)/2
* <= ceil(2^12 * 2^15 / 2^16) + (3329 + 1)/2
* = 3713
* In the worst case, we accumulate 8 such products, which bounds the output
* coefficients to 8 * 3713 which is less than 2^15 and fits in int16_t.
*/
.endm
/*
 * poly_basemul k
 *
 * Emits the base multiplication for polynomial index k of the vector by
 * expanding schoolbook for iter = 0..7. Each schoolbook step covers 32 words,
 * so the eight steps together cover 256 int16 words of each input.
 * For k == 0 the output is overwritten; for k > 0 it is accumulated into.
 */
.macro poly_basemul k
schoolbook 0 \k
schoolbook 1 \k
schoolbook 2 \k
schoolbook 3 \k
schoolbook 4 \k
schoolbook 5 \k
schoolbook 6 \k
schoolbook 7 \k
.endm
/*
 * polyvec_basemul k
 *
 * Emits the full accumulated base multiplication for a vector of k
 * polynomials, for k up to 4. Loads the _q and _qinv constants from the table
 * at r8, then expands poly_basemul for indices 0 .. k-1. Index 0 initialises
 * the output at rdi and every later index adds to it.
 *
 * The .if chain is evaluated by the assembler, so only the first k
 * poly_basemul expansions are emitted; values of k above 4 emit no more
 * than four.
 */
.macro polyvec_basemul k
/* Table offsets are given in 16-bit words, hence the *2 for bytes. */
vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQ*2(%r8), _q
vmovdqa MLK_AVX2_BACKEND_DATA_OFFSET_16XQINV*2(%r8), _qinv
.if \k > 0
poly_basemul 0
.endif
.if \k > 1
poly_basemul 1
.endif
.if \k > 2
poly_basemul 2
.endif
.if \k > 3
poly_basemul 3
.endif
.endm
/*
 * Drop the register aliases again. The C preprocessor has already replaced
 * them inside the .macro bodies above, so later expansions of those macros
 * are unaffected; the #undefs only stop the short names from being visible
 * in whatever text follows.
 */
#undef _q
#undef _qinv
#undef _a
#undef _b
#undef _c
#undef _d
#undef _dz
#undef _r0
#undef _r1
#undef _s0
#undef _s1
#undef _ac_hi
#undef _ad_hi
#undef _bdz_hi
#undef _bc_hi
#undef _a_lo
#undef _b_lo

View File

@@ -0,0 +1,489 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k2.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
/*
 * mlkem_polyvec_basemul_acc_montgomery_cached_asm_k2
 *
 * Fully unrolled accumulated base multiplication for a vector of two
 * polynomials. Register usage, as visible in the code below:
 *   rdi - output, 0x200 bytes; overwritten by the first polynomial and
 *         accumulated into by the second.
 *   rsi - first input vector,  0x400 bytes (pairs a, b).
 *   rdx - second input vector, 0x400 bytes (pairs c, d).
 *   rcx - multiplication cache for the second input, 0x200 bytes (dz).
 *   r8  - constant table; (%r8) is loaded into ymm0 and 0x20(%r8) into ymm1.
 * Every memory access is an aligned 32-byte vmovdqa, so all five pointers
 * must be 32-byte aligned. No general-purpose register is modified and the
 * stack is not used.
 *
 * Each 27-instruction block (33 for the second polynomial) handles 16 pairs
 * (a + bX) * (c + dX) with the same instruction pattern:
 *   ymm13 = a * ymm1 (low), ymm14 = b * ymm1 (low)
 *   ymm7/9/8/10 = low(ymm13*c), low(ymm13*d), low(ymm14*dz), low(ymm14*c)
 *   ymm7/9/8/10 = high(that * ymm0)
 *   ymm11/12/13/14 = high(a*c), high(a*d), high(b*dz), high(b*c)
 *   r = (ymm11 - ymm7) + (+/-)(ymm13 - ymm8),  s = (ymm12 - ymm9) + (ymm14 - ymm10)
 * In odd-numbered blocks the b*dz term is subtracted with swapped operands
 * (ymm8 - ymm13).
 * NOTE(review): presumably because the cached dz value is stored negated
 * for those blocks - confirm against the mulcache layout.
 */
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k2)
S2N_BN_SYMBOL(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k2):
.cfi_startproc
/* Constants: ymm0 and ymm1 stay live for the whole function. */
vmovdqa (%r8), %ymm0
vmovdqa 0x20(%r8), %ymm1
/* Polynomial 0, block 0 (even): result stored to 0x00/0x20(%rdi). */
vmovdqa (%rsi), %ymm2
vmovdqa 0x20(%rsi), %ymm3
vmovdqa (%rdx), %ymm4
vmovdqa 0x20(%rdx), %ymm5
vmovdqa (%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
/* Polynomial 0, block 1 (odd): result stored to 0x40/0x60(%rdi). */
vmovdqa 0x40(%rsi), %ymm2
vmovdqa 0x60(%rsi), %ymm3
vmovdqa 0x40(%rdx), %ymm4
vmovdqa 0x60(%rdx), %ymm5
vmovdqa 0x20(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
/* Polynomial 0, block 2 (even): result stored to 0x80/0xa0(%rdi). */
vmovdqa 0x80(%rsi), %ymm2
vmovdqa 0xa0(%rsi), %ymm3
vmovdqa 0x80(%rdx), %ymm4
vmovdqa 0xa0(%rdx), %ymm5
vmovdqa 0x40(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
/* Polynomial 0, block 3 (odd): result stored to 0xc0/0xe0(%rdi). */
vmovdqa 0xc0(%rsi), %ymm2
vmovdqa 0xe0(%rsi), %ymm3
vmovdqa 0xc0(%rdx), %ymm4
vmovdqa 0xe0(%rdx), %ymm5
vmovdqa 0x60(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
/* Polynomial 0, block 4 (even): result stored to 0x100/0x120(%rdi). */
vmovdqa 0x100(%rsi), %ymm2
vmovdqa 0x120(%rsi), %ymm3
vmovdqa 0x100(%rdx), %ymm4
vmovdqa 0x120(%rdx), %ymm5
vmovdqa 0x80(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
/* Polynomial 0, block 5 (odd): result stored to 0x140/0x160(%rdi). */
vmovdqa 0x140(%rsi), %ymm2
vmovdqa 0x160(%rsi), %ymm3
vmovdqa 0x140(%rdx), %ymm4
vmovdqa 0x160(%rdx), %ymm5
vmovdqa 0xa0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
/* Polynomial 0, block 6 (even): result stored to 0x180/0x1a0(%rdi). */
vmovdqa 0x180(%rsi), %ymm2
vmovdqa 0x1a0(%rsi), %ymm3
vmovdqa 0x180(%rdx), %ymm4
vmovdqa 0x1a0(%rdx), %ymm5
vmovdqa 0xc0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
/* Polynomial 0, block 7 (odd): result stored to 0x1c0/0x1e0(%rdi). */
vmovdqa 0x1c0(%rsi), %ymm2
vmovdqa 0x1e0(%rsi), %ymm3
vmovdqa 0x1c0(%rdx), %ymm4
vmovdqa 0x1e0(%rdx), %ymm5
vmovdqa 0xe0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
/* Polynomial 1, block 0 (even): added to 0x00/0x20(%rdi). */
vmovdqa 0x200(%rsi), %ymm2
vmovdqa 0x220(%rsi), %ymm3
vmovdqa 0x200(%rdx), %ymm4
vmovdqa 0x220(%rdx), %ymm5
vmovdqa 0x100(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa (%rdi), %ymm8
vmovdqa 0x20(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
/* Polynomial 1, block 1 (odd): added to 0x40/0x60(%rdi). */
vmovdqa 0x240(%rsi), %ymm2
vmovdqa 0x260(%rsi), %ymm3
vmovdqa 0x240(%rdx), %ymm4
vmovdqa 0x260(%rdx), %ymm5
vmovdqa 0x120(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x40(%rdi), %ymm8
vmovdqa 0x60(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
/* Polynomial 1, block 2 (even): added to 0x80/0xa0(%rdi). */
vmovdqa 0x280(%rsi), %ymm2
vmovdqa 0x2a0(%rsi), %ymm3
vmovdqa 0x280(%rdx), %ymm4
vmovdqa 0x2a0(%rdx), %ymm5
vmovdqa 0x140(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
/* Polynomial 1, block 3 (odd): added to 0xc0/0xe0(%rdi). */
vmovdqa 0x2c0(%rsi), %ymm2
vmovdqa 0x2e0(%rsi), %ymm3
vmovdqa 0x2c0(%rdx), %ymm4
vmovdqa 0x2e0(%rdx), %ymm5
vmovdqa 0x160(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0xc0(%rdi), %ymm8
vmovdqa 0xe0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
/* Polynomial 1, block 4 (even): added to 0x100/0x120(%rdi). */
vmovdqa 0x300(%rsi), %ymm2
vmovdqa 0x320(%rsi), %ymm3
vmovdqa 0x300(%rdx), %ymm4
vmovdqa 0x320(%rdx), %ymm5
vmovdqa 0x180(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x100(%rdi), %ymm8
vmovdqa 0x120(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
/* Polynomial 1, block 5 (odd): added to 0x140/0x160(%rdi). */
vmovdqa 0x340(%rsi), %ymm2
vmovdqa 0x360(%rsi), %ymm3
vmovdqa 0x340(%rdx), %ymm4
vmovdqa 0x360(%rdx), %ymm5
vmovdqa 0x1a0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x140(%rdi), %ymm8
vmovdqa 0x160(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
/* Polynomial 1, block 6 (even): added to 0x180/0x1a0(%rdi). */
vmovdqa 0x380(%rsi), %ymm2
vmovdqa 0x3a0(%rsi), %ymm3
vmovdqa 0x380(%rdx), %ymm4
vmovdqa 0x3a0(%rdx), %ymm5
vmovdqa 0x1c0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1a0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
/* Polynomial 1, block 7 (odd): added to 0x1c0/0x1e0(%rdi). */
vmovdqa 0x3c0(%rsi), %ymm2
vmovdqa 0x3e0(%rsi), %ymm3
vmovdqa 0x3c0(%rdx), %ymm4
vmovdqa 0x3e0(%rdx), %ymm5
vmovdqa 0x1e0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x1c0(%rdi), %ymm8
vmovdqa 0x1e0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,737 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k3.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
/*
 * mlkem_polyvec_basemul_acc_montgomery_cached_asm_k3
 *
 * Fully unrolled AVX2 routine made of 24 identical multiply steps, grouped
 * as three passes of eight steps. Each step handles two 32-byte vectors
 * (2 x 16 lanes of 16 bits) per input. Pass 1 stores its results to the
 * 0x200-byte output at (%rdi); passes 2 and 3 add their results onto it.
 *
 * Registers used as pointers (every access is a vmovdqa at a multiple of
 * 0x20, so each pointer must be 32-byte aligned):
 *   %rdi  output, 0x200 bytes written (and read back in passes 2 and 3)
 *   %rsi  first input,  0x600 bytes read (0x200 per pass)
 *   %rdx  second input, 0x600 bytes read (0x200 per pass)
 *   %rcx  third input,  0x300 bytes read (0x100 per pass); going by
 *         "cached" in the symbol name this is presumably a precomputed
 *         multiplication cache for the second input - confirm with caller
 *   %r8   two 32-byte constant vectors, loaded once into ymm0 and ymm1
 *
 * Per step, with a0/a1 loaded from %rsi, b0/b1 from %rdx and c from %rcx,
 * every 16-bit lane computes
 *   r0 = M(a0,b0) +/- M(a1,c)
 *   r1 = M(a0,b1)  +  M(a1,b0)
 * where M(x,y) = hi16(x*y) - hi16(lo16(lo16(x*ymm1)*y) * ymm0).
 * This is the shape of a signed 16-bit Montgomery multiplication if ymm0
 * holds the modulus and ymm1 its inverse mod 2^16.
 * NOTE(review): the constant values are not visible here - confirm.
 * The sign of the M(a1,c) term alternates: '+' in even-numbered steps and
 * '-' in odd-numbered steps (the operands of one vpsubw are swapped).
 *
 * All sums use wrapping 16-bit adds (vpaddw); no further reduction is
 * applied to the accumulated values in this routine.
 * Clobbers ymm0-ymm14. No general-purpose register and no stack memory is
 * written.
 */
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k3)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k3)
S2N_BN_SYMBOL(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k3):
.cfi_startproc
/* Constants, kept in ymm0/ymm1 for the whole routine. */
vmovdqa (%r8), %ymm0
vmovdqa 0x20(%r8), %ymm1
/* ---- Pass 1 (steps 0-7): results are stored directly to the output. ---- */
/* Step 0: load a0, a1, b0, b1 and c. */
vmovdqa (%rsi), %ymm2
vmovdqa 0x20(%rsi), %ymm3
vmovdqa (%rdx), %ymm4
vmovdqa 0x20(%rdx), %ymm5
vmovdqa (%rcx), %ymm6
/* ymm13 = lo16(a0*ymm1), ymm14 = lo16(a1*ymm1). */
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
/* Low halves of (x*ymm1)*y for the four products a0*b0, a0*b1, a1*c, a1*b0. */
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
/* Signed high halves of those values times ymm0 (the correction terms). */
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
/* Signed high halves of the plain products x*y. */
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
/* M(x,y) = hi16(x*y) - correction (AT&T order: dst = 2nd operand - 1st). */
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
/* r0 = M(a0,b0) + M(a1,c), r1 = M(a0,b1) + M(a1,b0). */
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
/* Step 1: same as step 0 at the next offsets, except for the sign flip below. */
vmovdqa 0x40(%rsi), %ymm2
vmovdqa 0x60(%rsi), %ymm3
vmovdqa 0x40(%rdx), %ymm4
vmovdqa 0x60(%rdx), %ymm5
vmovdqa 0x20(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
/* Operands swapped relative to step 0: ymm8 = -M(a1,c). Every odd step does this. */
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
/* Step 2 */
vmovdqa 0x80(%rsi), %ymm2
vmovdqa 0xa0(%rsi), %ymm3
vmovdqa 0x80(%rdx), %ymm4
vmovdqa 0xa0(%rdx), %ymm5
vmovdqa 0x40(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
/* Step 3 */
vmovdqa 0xc0(%rsi), %ymm2
vmovdqa 0xe0(%rsi), %ymm3
vmovdqa 0xc0(%rdx), %ymm4
vmovdqa 0xe0(%rdx), %ymm5
vmovdqa 0x60(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
/* Step 4 */
vmovdqa 0x100(%rsi), %ymm2
vmovdqa 0x120(%rsi), %ymm3
vmovdqa 0x100(%rdx), %ymm4
vmovdqa 0x120(%rdx), %ymm5
vmovdqa 0x80(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
/* Step 5 */
vmovdqa 0x140(%rsi), %ymm2
vmovdqa 0x160(%rsi), %ymm3
vmovdqa 0x140(%rdx), %ymm4
vmovdqa 0x160(%rdx), %ymm5
vmovdqa 0xa0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
/* Step 6 */
vmovdqa 0x180(%rsi), %ymm2
vmovdqa 0x1a0(%rsi), %ymm3
vmovdqa 0x180(%rdx), %ymm4
vmovdqa 0x1a0(%rdx), %ymm5
vmovdqa 0xc0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
/* Step 7 */
vmovdqa 0x1c0(%rsi), %ymm2
vmovdqa 0x1e0(%rsi), %ymm3
vmovdqa 0x1c0(%rdx), %ymm4
vmovdqa 0x1e0(%rdx), %ymm5
vmovdqa 0xe0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
/*
 * ---- Pass 2 (steps 8-15): inputs at 0x200.. of %rsi/%rdx and 0x100.. of
 * %rcx; each result is added to the value already stored in the output. ----
 */
/* Step 8 */
vmovdqa 0x200(%rsi), %ymm2
vmovdqa 0x220(%rsi), %ymm3
vmovdqa 0x200(%rdx), %ymm4
vmovdqa 0x220(%rdx), %ymm5
vmovdqa 0x100(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
/* Accumulate: reload the running sums, add this step's r0/r1, store back. */
vmovdqa (%rdi), %ymm8
vmovdqa 0x20(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
/* Step 9 */
vmovdqa 0x240(%rsi), %ymm2
vmovdqa 0x260(%rsi), %ymm3
vmovdqa 0x240(%rdx), %ymm4
vmovdqa 0x260(%rdx), %ymm5
vmovdqa 0x120(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x40(%rdi), %ymm8
vmovdqa 0x60(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
/* Step 10 */
vmovdqa 0x280(%rsi), %ymm2
vmovdqa 0x2a0(%rsi), %ymm3
vmovdqa 0x280(%rdx), %ymm4
vmovdqa 0x2a0(%rdx), %ymm5
vmovdqa 0x140(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
/* Step 11 */
vmovdqa 0x2c0(%rsi), %ymm2
vmovdqa 0x2e0(%rsi), %ymm3
vmovdqa 0x2c0(%rdx), %ymm4
vmovdqa 0x2e0(%rdx), %ymm5
vmovdqa 0x160(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0xc0(%rdi), %ymm8
vmovdqa 0xe0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
/* Step 12 */
vmovdqa 0x300(%rsi), %ymm2
vmovdqa 0x320(%rsi), %ymm3
vmovdqa 0x300(%rdx), %ymm4
vmovdqa 0x320(%rdx), %ymm5
vmovdqa 0x180(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x100(%rdi), %ymm8
vmovdqa 0x120(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
/* Step 13 */
vmovdqa 0x340(%rsi), %ymm2
vmovdqa 0x360(%rsi), %ymm3
vmovdqa 0x340(%rdx), %ymm4
vmovdqa 0x360(%rdx), %ymm5
vmovdqa 0x1a0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x140(%rdi), %ymm8
vmovdqa 0x160(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
/* Step 14 */
vmovdqa 0x380(%rsi), %ymm2
vmovdqa 0x3a0(%rsi), %ymm3
vmovdqa 0x380(%rdx), %ymm4
vmovdqa 0x3a0(%rdx), %ymm5
vmovdqa 0x1c0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1a0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
/* Step 15 */
vmovdqa 0x3c0(%rsi), %ymm2
vmovdqa 0x3e0(%rsi), %ymm3
vmovdqa 0x3c0(%rdx), %ymm4
vmovdqa 0x3e0(%rdx), %ymm5
vmovdqa 0x1e0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x1c0(%rdi), %ymm8
vmovdqa 0x1e0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
/*
 * ---- Pass 3 (steps 16-23): inputs at 0x400.. of %rsi/%rdx and 0x200.. of
 * %rcx; results are again added onto the output. ----
 */
/* Step 16 */
vmovdqa 0x400(%rsi), %ymm2
vmovdqa 0x420(%rsi), %ymm3
vmovdqa 0x400(%rdx), %ymm4
vmovdqa 0x420(%rdx), %ymm5
vmovdqa 0x200(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa (%rdi), %ymm8
vmovdqa 0x20(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
/* Step 17 */
vmovdqa 0x440(%rsi), %ymm2
vmovdqa 0x460(%rsi), %ymm3
vmovdqa 0x440(%rdx), %ymm4
vmovdqa 0x460(%rdx), %ymm5
vmovdqa 0x220(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x40(%rdi), %ymm8
vmovdqa 0x60(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
/* Step 18 */
vmovdqa 0x480(%rsi), %ymm2
vmovdqa 0x4a0(%rsi), %ymm3
vmovdqa 0x480(%rdx), %ymm4
vmovdqa 0x4a0(%rdx), %ymm5
vmovdqa 0x240(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
/* Step 19 */
vmovdqa 0x4c0(%rsi), %ymm2
vmovdqa 0x4e0(%rsi), %ymm3
vmovdqa 0x4c0(%rdx), %ymm4
vmovdqa 0x4e0(%rdx), %ymm5
vmovdqa 0x260(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0xc0(%rdi), %ymm8
vmovdqa 0xe0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
/* Step 20 */
vmovdqa 0x500(%rsi), %ymm2
vmovdqa 0x520(%rsi), %ymm3
vmovdqa 0x500(%rdx), %ymm4
vmovdqa 0x520(%rdx), %ymm5
vmovdqa 0x280(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x100(%rdi), %ymm8
vmovdqa 0x120(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
/* Step 21 */
vmovdqa 0x540(%rsi), %ymm2
vmovdqa 0x560(%rsi), %ymm3
vmovdqa 0x540(%rdx), %ymm4
vmovdqa 0x560(%rdx), %ymm5
vmovdqa 0x2a0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x140(%rdi), %ymm8
vmovdqa 0x160(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
/* Step 22 */
vmovdqa 0x580(%rsi), %ymm2
vmovdqa 0x5a0(%rsi), %ymm3
vmovdqa 0x580(%rdx), %ymm4
vmovdqa 0x5a0(%rdx), %ymm5
vmovdqa 0x2c0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1a0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
/* Step 23 */
vmovdqa 0x5c0(%rsi), %ymm2
vmovdqa 0x5e0(%rsi), %ymm3
vmovdqa 0x5c0(%rdx), %ymm4
vmovdqa 0x5e0(%rdx), %ymm5
vmovdqa 0x2e0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x1c0(%rdi), %ymm8
vmovdqa 0x1e0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,985 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/polyvec_basemul_acc_montgomery_cached_asm_k4.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k4)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k4)
S2N_BN_SYMBOL(mlkem_polyvec_basemul_acc_montgomery_cached_asm_k4):
.cfi_startproc
vmovdqa (%r8), %ymm0
vmovdqa 0x20(%r8), %ymm1
vmovdqa (%rsi), %ymm2
vmovdqa 0x20(%rsi), %ymm3
vmovdqa (%rdx), %ymm4
vmovdqa 0x20(%rdx), %ymm5
vmovdqa (%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
vmovdqa 0x40(%rsi), %ymm2
vmovdqa 0x60(%rsi), %ymm3
vmovdqa 0x40(%rdx), %ymm4
vmovdqa 0x60(%rdx), %ymm5
vmovdqa 0x20(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
vmovdqa 0x80(%rsi), %ymm2
vmovdqa 0xa0(%rsi), %ymm3
vmovdqa 0x80(%rdx), %ymm4
vmovdqa 0xa0(%rdx), %ymm5
vmovdqa 0x40(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
vmovdqa 0xc0(%rsi), %ymm2
vmovdqa 0xe0(%rsi), %ymm3
vmovdqa 0xc0(%rdx), %ymm4
vmovdqa 0xe0(%rdx), %ymm5
vmovdqa 0x60(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
vmovdqa 0x100(%rsi), %ymm2
vmovdqa 0x120(%rsi), %ymm3
vmovdqa 0x100(%rdx), %ymm4
vmovdqa 0x120(%rdx), %ymm5
vmovdqa 0x80(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
vmovdqa 0x140(%rsi), %ymm2
vmovdqa 0x160(%rsi), %ymm3
vmovdqa 0x140(%rdx), %ymm4
vmovdqa 0x160(%rdx), %ymm5
vmovdqa 0xa0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
vmovdqa 0x180(%rsi), %ymm2
vmovdqa 0x1a0(%rsi), %ymm3
vmovdqa 0x180(%rdx), %ymm4
vmovdqa 0x1a0(%rdx), %ymm5
vmovdqa 0xc0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
vmovdqa 0x1c0(%rsi), %ymm2
vmovdqa 0x1e0(%rsi), %ymm3
vmovdqa 0x1c0(%rdx), %ymm4
vmovdqa 0x1e0(%rdx), %ymm5
vmovdqa 0xe0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
vmovdqa 0x200(%rsi), %ymm2
vmovdqa 0x220(%rsi), %ymm3
vmovdqa 0x200(%rdx), %ymm4
vmovdqa 0x220(%rdx), %ymm5
vmovdqa 0x100(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa (%rdi), %ymm8
vmovdqa 0x20(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
vmovdqa 0x240(%rsi), %ymm2
vmovdqa 0x260(%rsi), %ymm3
vmovdqa 0x240(%rdx), %ymm4
vmovdqa 0x260(%rdx), %ymm5
vmovdqa 0x120(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x40(%rdi), %ymm8
vmovdqa 0x60(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
vmovdqa 0x280(%rsi), %ymm2
vmovdqa 0x2a0(%rsi), %ymm3
vmovdqa 0x280(%rdx), %ymm4
vmovdqa 0x2a0(%rdx), %ymm5
vmovdqa 0x140(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
vmovdqa 0x2c0(%rsi), %ymm2
vmovdqa 0x2e0(%rsi), %ymm3
vmovdqa 0x2c0(%rdx), %ymm4
vmovdqa 0x2e0(%rdx), %ymm5
vmovdqa 0x160(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0xc0(%rdi), %ymm8
vmovdqa 0xe0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
vmovdqa 0x300(%rsi), %ymm2
vmovdqa 0x320(%rsi), %ymm3
vmovdqa 0x300(%rdx), %ymm4
vmovdqa 0x320(%rdx), %ymm5
vmovdqa 0x180(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x100(%rdi), %ymm8
vmovdqa 0x120(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
vmovdqa 0x340(%rsi), %ymm2
vmovdqa 0x360(%rsi), %ymm3
vmovdqa 0x340(%rdx), %ymm4
vmovdqa 0x360(%rdx), %ymm5
vmovdqa 0x1a0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x140(%rdi), %ymm8
vmovdqa 0x160(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
vmovdqa 0x380(%rsi), %ymm2
vmovdqa 0x3a0(%rsi), %ymm3
vmovdqa 0x380(%rdx), %ymm4
vmovdqa 0x3a0(%rdx), %ymm5
vmovdqa 0x1c0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1a0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
vmovdqa 0x3c0(%rsi), %ymm2
vmovdqa 0x3e0(%rsi), %ymm3
vmovdqa 0x3c0(%rdx), %ymm4
vmovdqa 0x3e0(%rdx), %ymm5
vmovdqa 0x1e0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x1c0(%rdi), %ymm8
vmovdqa 0x1e0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
vmovdqa 0x400(%rsi), %ymm2
vmovdqa 0x420(%rsi), %ymm3
vmovdqa 0x400(%rdx), %ymm4
vmovdqa 0x420(%rdx), %ymm5
vmovdqa 0x200(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa (%rdi), %ymm8
vmovdqa 0x20(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
vmovdqa 0x440(%rsi), %ymm2
vmovdqa 0x460(%rsi), %ymm3
vmovdqa 0x440(%rdx), %ymm4
vmovdqa 0x460(%rdx), %ymm5
vmovdqa 0x220(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x40(%rdi), %ymm8
vmovdqa 0x60(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
vmovdqa 0x480(%rsi), %ymm2
vmovdqa 0x4a0(%rsi), %ymm3
vmovdqa 0x480(%rdx), %ymm4
vmovdqa 0x4a0(%rdx), %ymm5
vmovdqa 0x240(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
vmovdqa 0x4c0(%rsi), %ymm2
vmovdqa 0x4e0(%rsi), %ymm3
vmovdqa 0x4c0(%rdx), %ymm4
vmovdqa 0x4e0(%rdx), %ymm5
vmovdqa 0x260(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0xc0(%rdi), %ymm8
vmovdqa 0xe0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
vmovdqa 0x500(%rsi), %ymm2
vmovdqa 0x520(%rsi), %ymm3
vmovdqa 0x500(%rdx), %ymm4
vmovdqa 0x520(%rdx), %ymm5
vmovdqa 0x280(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x100(%rdi), %ymm8
vmovdqa 0x120(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
vmovdqa 0x540(%rsi), %ymm2
vmovdqa 0x560(%rsi), %ymm3
vmovdqa 0x540(%rdx), %ymm4
vmovdqa 0x560(%rdx), %ymm5
vmovdqa 0x2a0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x140(%rdi), %ymm8
vmovdqa 0x160(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
vmovdqa 0x580(%rsi), %ymm2
vmovdqa 0x5a0(%rsi), %ymm3
vmovdqa 0x580(%rdx), %ymm4
vmovdqa 0x5a0(%rdx), %ymm5
vmovdqa 0x2c0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1a0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
vmovdqa 0x5c0(%rsi), %ymm2
vmovdqa 0x5e0(%rsi), %ymm3
vmovdqa 0x5c0(%rdx), %ymm4
vmovdqa 0x5e0(%rdx), %ymm5
vmovdqa 0x2e0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x1c0(%rdi), %ymm8
vmovdqa 0x1e0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
vmovdqa 0x600(%rsi), %ymm2
vmovdqa 0x620(%rsi), %ymm3
vmovdqa 0x600(%rdx), %ymm4
vmovdqa 0x620(%rdx), %ymm5
vmovdqa 0x300(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa (%rdi), %ymm8
vmovdqa 0x20(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, (%rdi)
vmovdqa %ymm9, 0x20(%rdi)
vmovdqa 0x640(%rsi), %ymm2
vmovdqa 0x660(%rsi), %ymm3
vmovdqa 0x640(%rdx), %ymm4
vmovdqa 0x660(%rdx), %ymm5
vmovdqa 0x320(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x40(%rdi), %ymm8
vmovdqa 0x60(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x40(%rdi)
vmovdqa %ymm9, 0x60(%rdi)
vmovdqa 0x680(%rsi), %ymm2
vmovdqa 0x6a0(%rsi), %ymm3
vmovdqa 0x680(%rdx), %ymm4
vmovdqa 0x6a0(%rdx), %ymm5
vmovdqa 0x340(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x80(%rdi), %ymm8
vmovdqa 0xa0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm9, 0xa0(%rdi)
vmovdqa 0x6c0(%rsi), %ymm2
vmovdqa 0x6e0(%rsi), %ymm3
vmovdqa 0x6c0(%rdx), %ymm4
vmovdqa 0x6e0(%rdx), %ymm5
vmovdqa 0x360(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0xc0(%rdi), %ymm8
vmovdqa 0xe0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
vmovdqa 0x700(%rsi), %ymm2
vmovdqa 0x720(%rsi), %ymm3
vmovdqa 0x700(%rdx), %ymm4
vmovdqa 0x720(%rdx), %ymm5
vmovdqa 0x380(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x100(%rdi), %ymm8
vmovdqa 0x120(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x100(%rdi)
vmovdqa %ymm9, 0x120(%rdi)
vmovdqa 0x740(%rsi), %ymm2
vmovdqa 0x760(%rsi), %ymm3
vmovdqa 0x740(%rdx), %ymm4
vmovdqa 0x760(%rdx), %ymm5
vmovdqa 0x3a0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x140(%rdi), %ymm8
vmovdqa 0x160(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x140(%rdi)
vmovdqa %ymm9, 0x160(%rdi)
vmovdqa 0x780(%rsi), %ymm2
vmovdqa 0x7a0(%rsi), %ymm3
vmovdqa 0x780(%rdx), %ymm4
vmovdqa 0x7a0(%rdx), %ymm5
vmovdqa 0x3c0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm8, %ymm13, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x180(%rdi), %ymm8
vmovdqa 0x1a0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x180(%rdi)
vmovdqa %ymm9, 0x1a0(%rdi)
vmovdqa 0x7c0(%rsi), %ymm2
vmovdqa 0x7e0(%rsi), %ymm3
vmovdqa 0x7c0(%rdx), %ymm4
vmovdqa 0x7e0(%rdx), %ymm5
vmovdqa 0x3e0(%rcx), %ymm6
vpmullw %ymm2, %ymm1, %ymm13
vpmullw %ymm3, %ymm1, %ymm14
vpmullw %ymm13, %ymm4, %ymm7
vpmullw %ymm13, %ymm5, %ymm9
vpmullw %ymm14, %ymm6, %ymm8
vpmullw %ymm14, %ymm4, %ymm10
vpmulhw %ymm7, %ymm0, %ymm7
vpmulhw %ymm9, %ymm0, %ymm9
vpmulhw %ymm8, %ymm0, %ymm8
vpmulhw %ymm10, %ymm0, %ymm10
vpmulhw %ymm2, %ymm4, %ymm11
vpmulhw %ymm2, %ymm5, %ymm12
vpmulhw %ymm3, %ymm6, %ymm13
vpmulhw %ymm3, %ymm4, %ymm14
vpsubw %ymm7, %ymm11, %ymm7
vpsubw %ymm9, %ymm12, %ymm9
vpsubw %ymm13, %ymm8, %ymm8
vpsubw %ymm10, %ymm14, %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa 0x1c0(%rdi), %ymm8
vmovdqa 0x1e0(%rdi), %ymm10
vpaddw %ymm7, %ymm8, %ymm7
vpaddw %ymm9, %ymm10, %ymm9
vmovdqa %ymm7, 0x1c0(%rdi)
vmovdqa %ymm9, 0x1e0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,133 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*
* Changes:
* - Add call to csub in reduce128_avx to produce outputs
* in [0,1,...,q-1] rather than [0,1,...,q], matching the
* semantics of mlk_poly_reduce().
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/reduce.S using scripts/simpasm. Do not modify it directly.
*/
/*
 * mlkem_reduce_avx2
 *
 * Reduces, in place, the 512 bytes (256 16-bit lanes) starting at %rdi by
 * running Lreduce_avx2_core on two consecutive 256-byte halves.
 *
 * Inputs:
 *   %rdi - buffer of 16-bit lanes, read and written with vmovdqa, so it
 *          must be 32-byte aligned.
 *   %rsi - constant block, also read with vmovdqa (32-byte aligned):
 *          32 bytes at offset 0x00 are loaded into %ymm0 and 32 bytes at
 *          offset 0x40 into %ymm1.
 *          NOTE(review): presumably 16 copies of the modulus q and 16
 *          copies of the Barrett multiplier respectively - confirm
 *          against the definition of the constant table.
 *
 * On return %rdi has been advanced by 0x100. %ymm0 and %ymm1 hold the
 * constants; the core additionally overwrites %ymm2-%ymm9 and %ymm12.
 */
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_reduce_avx2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_reduce_avx2)
S2N_BN_SYMBOL(mlkem_reduce_avx2):
.cfi_startproc
/* Load the two constant vectors shared by both core invocations. */
vmovdqa (%rsi), %ymm0
vmovdqa 0x40(%rsi), %ymm1
/* First half: bytes 0x000-0x0ff of the buffer. */
callq Lreduce_avx2_core
/* Second half: step the pointer by 256 bytes (128 lanes) and repeat. */
addq $0x100, %rdi # imm = 0x100
callq Lreduce_avx2_core
retq
.cfi_endproc
/*
 * Lreduce_avx2_core
 *
 * Local helper: processes 256 bytes (eight YMM registers of sixteen
 * 16-bit lanes each) at %rdi in place.
 *
 * Expects %ymm0 and %ymm1 to have been loaded by the caller. With r a
 * lane value, c0 the matching lane of %ymm0 and c1 that of %ymm1, each
 * lane goes through two steps:
 *
 *   1. t = mulhi(r, c1) >> 10 (arithmetic shift)
 *      r = r - mullo(t, c0)
 *   2. r = r - c0
 *      r = r + (c0 if r is negative, else 0)
 *
 * Step 1 has the shape of a Barrett reduction and step 2 is a
 * branch-free conditional subtraction.
 * NOTE(review): the output range depends on c0/c1 being the modulus and
 * its Barrett multiplier - confirm against the caller's constant table.
 *
 * Overwrites %ymm2-%ymm9 (data) and %ymm12 (scratch). %rdi is unchanged.
 */
Lreduce_avx2_core:
.cfi_startproc
/* Load eight vectors of lanes; vmovdqa requires 32-byte alignment. */
vmovdqa (%rdi), %ymm2
vmovdqa 0x20(%rdi), %ymm3
vmovdqa 0x40(%rdi), %ymm4
vmovdqa 0x60(%rdi), %ymm5
vmovdqa 0x80(%rdi), %ymm6
vmovdqa 0xa0(%rdi), %ymm7
vmovdqa 0xc0(%rdi), %ymm8
vmovdqa 0xe0(%rdi), %ymm9
/*
 * Step 1 on %ymm2: high half of the signed product with %ymm1, shift
 * right arithmetically by 10, multiply (low half) by %ymm0, subtract.
 */
vpmulhw %ymm1, %ymm2, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm2, %ymm2
/* Step 1 on %ymm3. */
vpmulhw %ymm1, %ymm3, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm3, %ymm3
/* Step 1 on %ymm4. */
vpmulhw %ymm1, %ymm4, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm4, %ymm4
/* Step 1 on %ymm5. */
vpmulhw %ymm1, %ymm5, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm5, %ymm5
/* Step 1 on %ymm6. */
vpmulhw %ymm1, %ymm6, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm6, %ymm6
/* Step 1 on %ymm7. */
vpmulhw %ymm1, %ymm7, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm7, %ymm7
/* Step 1 on %ymm8. */
vpmulhw %ymm1, %ymm8, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm8, %ymm8
/* Step 1 on %ymm9. */
vpmulhw %ymm1, %ymm9, %ymm12
vpsraw $0xa, %ymm12, %ymm12
vpmullw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm9, %ymm9
/*
 * Step 2 on %ymm2: subtract %ymm0, broadcast each lane's sign bit into
 * an all-ones/all-zeros mask (shift by 15), and add %ymm0 back only in
 * the lanes that went negative.
 */
vpsubw %ymm0, %ymm2, %ymm2
vpsraw $0xf, %ymm2, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpaddw %ymm12, %ymm2, %ymm2
/* Step 2 on %ymm3. */
vpsubw %ymm0, %ymm3, %ymm3
vpsraw $0xf, %ymm3, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpaddw %ymm12, %ymm3, %ymm3
/* Step 2 on %ymm4. */
vpsubw %ymm0, %ymm4, %ymm4
vpsraw $0xf, %ymm4, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpaddw %ymm12, %ymm4, %ymm4
/* Step 2 on %ymm5. */
vpsubw %ymm0, %ymm5, %ymm5
vpsraw $0xf, %ymm5, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpaddw %ymm12, %ymm5, %ymm5
/* Step 2 on %ymm6. */
vpsubw %ymm0, %ymm6, %ymm6
vpsraw $0xf, %ymm6, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpaddw %ymm12, %ymm6, %ymm6
/* Step 2 on %ymm7. */
vpsubw %ymm0, %ymm7, %ymm7
vpsraw $0xf, %ymm7, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpaddw %ymm12, %ymm7, %ymm7
/* Step 2 on %ymm8. */
vpsubw %ymm0, %ymm8, %ymm8
vpsraw $0xf, %ymm8, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpaddw %ymm12, %ymm8, %ymm8
/* Step 2 on %ymm9. */
vpsubw %ymm0, %ymm9, %ymm9
vpsraw $0xf, %ymm9, %ymm12
vpand %ymm0, %ymm12, %ymm12
vpaddw %ymm12, %ymm9, %ymm9
/* Write the eight reduced vectors back over the input. */
vmovdqa %ymm2, (%rdi)
vmovdqa %ymm3, 0x20(%rdi)
vmovdqa %ymm4, 0x40(%rdi)
vmovdqa %ymm5, 0x60(%rdi)
vmovdqa %ymm6, 0x80(%rdi)
vmovdqa %ymm7, 0xa0(%rdi)
vmovdqa %ymm8, 0xc0(%rdi)
vmovdqa %ymm9, 0xe0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,89 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*************************************************
* Name: mlk_rej_uniform_asm
*
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
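* Arguments: - int16_t *r: pointer to output buffer of MLKEM_N
* 16-bit coefficients.
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* - unsigned buflen: length of input buffer in bytes.
* Must be a multiple of 12.
* - table (fourth argument, passed in %rcx): pointer to
* a lookup table of 16-byte byte-shuffle masks, indexed
* by the 8-bit mask of accepted candidates.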
*
* Returns number of sampled 16-bit integers (at most MLKEM_N).
**************************************************/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/rej_uniform_asm.S using scripts/simpasm. Do not modify it directly.
*/
/*
 * mlkem_rej_uniform_asm
 *
 * Register usage as read from the code below:
 *   %rdi - destination for the accepted 16-bit values
 *   %rsi - input byte buffer (overwritten before the final copy)
 *   %rdx - input length in bytes, compared against the running offset
 *   %rcx - base of a table of 16-byte pshufb masks, indexed by the 8-bit
 *          acceptance mask (overwritten before the final copy)
 *   %rax - running count of accepted values; the return value, clamped
 *          to 0x100
 *   %r8  - current byte offset into the input, stepped by 12
 *   %r9  - constant 0x5555 used as the pext selector
 *
 * Each iteration turns 12 input bytes into eight 12-bit candidates, keeps
 * those strictly below 0x0d01 (3329), packs the survivors to the front of
 * the vector and appends them to a scratch buffer on the stack. The
 * scratch buffer is copied to %rdi at the end.
 *
 * Scratch size: a store only happens while %rax < 0x100, so the furthest
 * 16-byte store starts at byte offset 2*255 = 510 and ends at 526, which
 * fits in the 0x210 (528) bytes reserved.
 *
 * NOTE(review): every iteration loads 16 bytes although only bytes 0-11
 * are referenced by the shuffle, so the last iteration reads 4 bytes
 * beyond the 12 it consumes; the caller's buffer must make that read
 * safe - confirm at the call site.
 * NOTE(review): the loop body runs once before the length is checked
 * (do-while shape), so a zero length still performs one load.
 *
 * Clobbers %rax, %rcx, %rsi, %rdi, %r8-%r11, %xmm0-%xmm6 and flags.
 */
.text
.balign 4
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_rej_uniform_asm)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_rej_uniform_asm)
S2N_BN_SYMBOL(mlkem_rej_uniform_asm):
.cfi_startproc
/* Reserve the 528-byte scratch buffer for accepted values. */
subq $0x210, %rsp # imm = 0x210
.cfi_adjust_cfa_offset 0x210
/* %xmm0 = eight copies of 0x0d01, the exclusive upper bound. */
movabsq $0xd010d010d010d01, %rax # imm = 0xD010D010D010D01
movq %rax, %xmm0
pinsrq $0x1, %rax, %xmm0
/* %xmm5 = eight copies of 0x0fff, the 12-bit mask. */
movabsq $0xfff0fff0fff0fff, %rax # imm = 0xFFF0FFF0FFF0FFF
movq %rax, %xmm5
pinsrq $0x1, %rax, %xmm5
/*
 * %xmm4 = byte shuffle 0,1,1,2,3,4,4,5,6,7,7,8,9,10,10,11: every 3-byte
 * group b0,b1,b2 is expanded into the two 16-bit words (b0,b1), (b1,b2).
 */
movabsq $0x504040302010100, %rax # imm = 0x504040302010100
movq %rax, %xmm4
movabsq $0xb0a0a0908070706, %rax # imm = 0xB0A0A0908070706
pinsrq $0x1, %rax, %xmm4
/* Zero the accepted-count and the input offset; set the pext selector. */
movq $0x0, %rax
movq $0x0, %r8
movq $0x5555, %r9 # imm = 0x5555
Lrej_uniform_asm_loop_start:
/* Unaligned 16-byte load; only the first 12 bytes are used. */
movdqu (%rsi,%r8), %xmm2
pshufb %xmm4, %xmm2
/*
 * Even words keep their low 12 bits; odd words take the value shifted
 * right by 4 (pblendw 0xaa selects the odd words from %xmm6). The final
 * pand trims every word to 12 bits.
 */
movdqa %xmm2, %xmm6
psrlw $0x4, %xmm6
pblendw $0xaa, %xmm6, %xmm2 # xmm2 = xmm2[0],xmm6[1],xmm2[2],xmm6[3],xmm2[4],xmm6[5],xmm2[6],xmm6[7]
pand %xmm5, %xmm2
/* %xmm1 = per-word all-ones where bound > candidate (accepted). */
movdqa %xmm0, %xmm1
pcmpgtw %xmm2, %xmm1
/*
 * pmovmskb yields two identical bits per word; pext with 0x5555 keeps
 * one bit per word, giving an 8-bit acceptance mask in %r11.
 */
pmovmskb %xmm1, %r11d
pextq %r9, %r11, %r11
/* Table entries are 16 bytes: byte offset = mask << 4. */
movq %r11, %r10
shlq $0x4, %r10
movdqu (%rcx,%r10), %xmm3
/* Apply the table's shuffle to the candidates. */
pshufb %xmm3, %xmm2
/* Store at scratch[count]; later stores overwrite the unused tail. */
movdqu %xmm2, (%rsp,%rax,2)
/* Advance the count by the number of accepted words. */
popcntq %r11, %r11
addq %r11, %rax
/* Stop as soon as at least 0x100 values have been collected. */
cmpq $0x100, %rax # imm = 0x100
jae Lrej_uniform_asm_final_copy
/* Otherwise consume the next 12 bytes while offset < length (unsigned). */
addq $0xc, %r8
cmpq %r8, %rdx
ja Lrej_uniform_asm_loop_start
Lrej_uniform_asm_final_copy:
/* Clamp the count to 0x100; the last store may have overshot it. */
movq $0x100, %rcx # imm = 0x100
cmpq $0x100, %rax # imm = 0x100
cmovaq %rcx, %rax
/* Copy count * 2 bytes from the scratch buffer to the destination. */
movq %rsp, %rsi
movq %rax, %rcx
shlq %rcx
rep movsb (%rsi), %es:(%rdi)
/* Release the scratch buffer; %rax holds the return value. */
addq $0x210, %rsp # imm = 0x210
.cfi_adjust_cfa_offset -0x210
retq
.cfi_endproc

View File

@@ -0,0 +1,545 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* WARNING: This file is auto-generated from scripts/autogen
* in the mlkem-native repository.
* Do not modify it directly.
*/
#include "../../../common.h"
#if defined(MLK_ARITH_BACKEND_X86_64_DEFAULT) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <stdint.h>
#include "arith_native_x86_64.h"
/*
* Lookup table used by rejection sampling of the public matrix.
* See autogen for details.
*/
MLK_ALIGN const uint8_t mlk_rej_uniform_table[] = {
255, 255, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 0 */,
0, 1, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 1 */,
2, 3, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 2 */,
0, 1, 2, 3, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 3 */,
4, 5, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 4 */,
0, 1, 4, 5, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 5 */,
2, 3, 4, 5, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 6 */,
0, 1, 2, 3, 4, 5, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 7 */,
6, 7, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 8 */,
0, 1, 6, 7, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 9 */,
2, 3, 6, 7, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 10 */,
0, 1, 2, 3, 6, 7, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 11 */,
4, 5, 6, 7, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 12 */,
0, 1, 4, 5, 6, 7, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 13 */,
2, 3, 4, 5, 6, 7, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 14 */,
0, 1, 2, 3, 4, 5, 6, 7,
255, 255, 255, 255, 255, 255, 255, 255 /* 15 */,
8, 9, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 16 */,
0, 1, 8, 9, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 17 */,
2, 3, 8, 9, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 18 */,
0, 1, 2, 3, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 19 */,
4, 5, 8, 9, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 20 */,
0, 1, 4, 5, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 21 */,
2, 3, 4, 5, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 22 */,
0, 1, 2, 3, 4, 5, 8, 9,
255, 255, 255, 255, 255, 255, 255, 255 /* 23 */,
6, 7, 8, 9, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 24 */,
0, 1, 6, 7, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 25 */,
2, 3, 6, 7, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 26 */,
0, 1, 2, 3, 6, 7, 8, 9,
255, 255, 255, 255, 255, 255, 255, 255 /* 27 */,
4, 5, 6, 7, 8, 9, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 28 */,
0, 1, 4, 5, 6, 7, 8, 9,
255, 255, 255, 255, 255, 255, 255, 255 /* 29 */,
2, 3, 4, 5, 6, 7, 8, 9,
255, 255, 255, 255, 255, 255, 255, 255 /* 30 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 255, 255, 255, 255, 255, 255 /* 31 */,
10, 11, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 32 */,
0, 1, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 33 */,
2, 3, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 34 */,
0, 1, 2, 3, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 35 */,
4, 5, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 36 */,
0, 1, 4, 5, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 37 */,
2, 3, 4, 5, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 38 */,
0, 1, 2, 3, 4, 5, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 39 */,
6, 7, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 40 */,
0, 1, 6, 7, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 41 */,
2, 3, 6, 7, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 42 */,
0, 1, 2, 3, 6, 7, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 43 */,
4, 5, 6, 7, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 44 */,
0, 1, 4, 5, 6, 7, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 45 */,
2, 3, 4, 5, 6, 7, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 46 */,
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 255, 255, 255, 255, 255, 255 /* 47 */,
8, 9, 10, 11, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 48 */,
0, 1, 8, 9, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 49 */,
2, 3, 8, 9, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 50 */,
0, 1, 2, 3, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 51 */,
4, 5, 8, 9, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 52 */,
0, 1, 4, 5, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 53 */,
2, 3, 4, 5, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 54 */,
0, 1, 2, 3, 4, 5, 8, 9,
10, 11, 255, 255, 255, 255, 255, 255 /* 55 */,
6, 7, 8, 9, 10, 11, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 56 */,
0, 1, 6, 7, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 57 */,
2, 3, 6, 7, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 58 */,
0, 1, 2, 3, 6, 7, 8, 9,
10, 11, 255, 255, 255, 255, 255, 255 /* 59 */,
4, 5, 6, 7, 8, 9, 10, 11,
255, 255, 255, 255, 255, 255, 255, 255 /* 60 */,
0, 1, 4, 5, 6, 7, 8, 9,
10, 11, 255, 255, 255, 255, 255, 255 /* 61 */,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 255, 255, 255, 255, 255, 255 /* 62 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 255, 255, 255, 255 /* 63 */,
12, 13, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 64 */,
0, 1, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 65 */,
2, 3, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 66 */,
0, 1, 2, 3, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 67 */,
4, 5, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 68 */,
0, 1, 4, 5, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 69 */,
2, 3, 4, 5, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 70 */,
0, 1, 2, 3, 4, 5, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 71 */,
6, 7, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 72 */,
0, 1, 6, 7, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 73 */,
2, 3, 6, 7, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 74 */,
0, 1, 2, 3, 6, 7, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 75 */,
4, 5, 6, 7, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 76 */,
0, 1, 4, 5, 6, 7, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 77 */,
2, 3, 4, 5, 6, 7, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 78 */,
0, 1, 2, 3, 4, 5, 6, 7,
12, 13, 255, 255, 255, 255, 255, 255 /* 79 */,
8, 9, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 80 */,
0, 1, 8, 9, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 81 */,
2, 3, 8, 9, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 82 */,
0, 1, 2, 3, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 83 */,
4, 5, 8, 9, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 84 */,
0, 1, 4, 5, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 85 */,
2, 3, 4, 5, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 86 */,
0, 1, 2, 3, 4, 5, 8, 9,
12, 13, 255, 255, 255, 255, 255, 255 /* 87 */,
6, 7, 8, 9, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 88 */,
0, 1, 6, 7, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 89 */,
2, 3, 6, 7, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 90 */,
0, 1, 2, 3, 6, 7, 8, 9,
12, 13, 255, 255, 255, 255, 255, 255 /* 91 */,
4, 5, 6, 7, 8, 9, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 92 */,
0, 1, 4, 5, 6, 7, 8, 9,
12, 13, 255, 255, 255, 255, 255, 255 /* 93 */,
2, 3, 4, 5, 6, 7, 8, 9,
12, 13, 255, 255, 255, 255, 255, 255 /* 94 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 12, 13, 255, 255, 255, 255 /* 95 */,
10, 11, 12, 13, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 96 */,
0, 1, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 97 */,
2, 3, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 98 */,
0, 1, 2, 3, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 99 */,
4, 5, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 100 */,
0, 1, 4, 5, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 101 */,
2, 3, 4, 5, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 102 */,
0, 1, 2, 3, 4, 5, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 103 */,
6, 7, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 104 */,
0, 1, 6, 7, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 105 */,
2, 3, 6, 7, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 106 */,
0, 1, 2, 3, 6, 7, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 107 */,
4, 5, 6, 7, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 108 */,
0, 1, 4, 5, 6, 7, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 109 */,
2, 3, 4, 5, 6, 7, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 110 */,
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 12, 13, 255, 255, 255, 255 /* 111 */,
8, 9, 10, 11, 12, 13, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 112 */,
0, 1, 8, 9, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 113 */,
2, 3, 8, 9, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 114 */,
0, 1, 2, 3, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 115 */,
4, 5, 8, 9, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 116 */,
0, 1, 4, 5, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 117 */,
2, 3, 4, 5, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 118 */,
0, 1, 2, 3, 4, 5, 8, 9,
10, 11, 12, 13, 255, 255, 255, 255 /* 119 */,
6, 7, 8, 9, 10, 11, 12, 13,
255, 255, 255, 255, 255, 255, 255, 255 /* 120 */,
0, 1, 6, 7, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 121 */,
2, 3, 6, 7, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 122 */,
0, 1, 2, 3, 6, 7, 8, 9,
10, 11, 12, 13, 255, 255, 255, 255 /* 123 */,
4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 255, 255, 255, 255, 255, 255 /* 124 */,
0, 1, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 255, 255, 255, 255 /* 125 */,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 255, 255, 255, 255 /* 126 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 255, 255 /* 127 */,
14, 15, 255, 255, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 128 */,
0, 1, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 129 */,
2, 3, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 130 */,
0, 1, 2, 3, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 131 */,
4, 5, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 132 */,
0, 1, 4, 5, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 133 */,
2, 3, 4, 5, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 134 */,
0, 1, 2, 3, 4, 5, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 135 */,
6, 7, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 136 */,
0, 1, 6, 7, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 137 */,
2, 3, 6, 7, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 138 */,
0, 1, 2, 3, 6, 7, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 139 */,
4, 5, 6, 7, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 140 */,
0, 1, 4, 5, 6, 7, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 141 */,
2, 3, 4, 5, 6, 7, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 142 */,
0, 1, 2, 3, 4, 5, 6, 7,
14, 15, 255, 255, 255, 255, 255, 255 /* 143 */,
8, 9, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 144 */,
0, 1, 8, 9, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 145 */,
2, 3, 8, 9, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 146 */,
0, 1, 2, 3, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 147 */,
4, 5, 8, 9, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 148 */,
0, 1, 4, 5, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 149 */,
2, 3, 4, 5, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 150 */,
0, 1, 2, 3, 4, 5, 8, 9,
14, 15, 255, 255, 255, 255, 255, 255 /* 151 */,
6, 7, 8, 9, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 152 */,
0, 1, 6, 7, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 153 */,
2, 3, 6, 7, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 154 */,
0, 1, 2, 3, 6, 7, 8, 9,
14, 15, 255, 255, 255, 255, 255, 255 /* 155 */,
4, 5, 6, 7, 8, 9, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 156 */,
0, 1, 4, 5, 6, 7, 8, 9,
14, 15, 255, 255, 255, 255, 255, 255 /* 157 */,
2, 3, 4, 5, 6, 7, 8, 9,
14, 15, 255, 255, 255, 255, 255, 255 /* 158 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 14, 15, 255, 255, 255, 255 /* 159 */,
10, 11, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 160 */,
0, 1, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 161 */,
2, 3, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 162 */,
0, 1, 2, 3, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 163 */,
4, 5, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 164 */,
0, 1, 4, 5, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 165 */,
2, 3, 4, 5, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 166 */,
0, 1, 2, 3, 4, 5, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 167 */,
6, 7, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 168 */,
0, 1, 6, 7, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 169 */,
2, 3, 6, 7, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 170 */,
0, 1, 2, 3, 6, 7, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 171 */,
4, 5, 6, 7, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 172 */,
0, 1, 4, 5, 6, 7, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 173 */,
2, 3, 4, 5, 6, 7, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 174 */,
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 14, 15, 255, 255, 255, 255 /* 175 */,
8, 9, 10, 11, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 176 */,
0, 1, 8, 9, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 177 */,
2, 3, 8, 9, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 178 */,
0, 1, 2, 3, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 179 */,
4, 5, 8, 9, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 180 */,
0, 1, 4, 5, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 181 */,
2, 3, 4, 5, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 182 */,
0, 1, 2, 3, 4, 5, 8, 9,
10, 11, 14, 15, 255, 255, 255, 255 /* 183 */,
6, 7, 8, 9, 10, 11, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 184 */,
0, 1, 6, 7, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 185 */,
2, 3, 6, 7, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 186 */,
0, 1, 2, 3, 6, 7, 8, 9,
10, 11, 14, 15, 255, 255, 255, 255 /* 187 */,
4, 5, 6, 7, 8, 9, 10, 11,
14, 15, 255, 255, 255, 255, 255, 255 /* 188 */,
0, 1, 4, 5, 6, 7, 8, 9,
10, 11, 14, 15, 255, 255, 255, 255 /* 189 */,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 14, 15, 255, 255, 255, 255 /* 190 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 14, 15, 255, 255 /* 191 */,
12, 13, 14, 15, 255, 255, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 192 */,
0, 1, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 193 */,
2, 3, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 194 */,
0, 1, 2, 3, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 195 */,
4, 5, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 196 */,
0, 1, 4, 5, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 197 */,
2, 3, 4, 5, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 198 */,
0, 1, 2, 3, 4, 5, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 199 */,
6, 7, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 200 */,
0, 1, 6, 7, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 201 */,
2, 3, 6, 7, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 202 */,
0, 1, 2, 3, 6, 7, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 203 */,
4, 5, 6, 7, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 204 */,
0, 1, 4, 5, 6, 7, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 205 */,
2, 3, 4, 5, 6, 7, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 206 */,
0, 1, 2, 3, 4, 5, 6, 7,
12, 13, 14, 15, 255, 255, 255, 255 /* 207 */,
8, 9, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 208 */,
0, 1, 8, 9, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 209 */,
2, 3, 8, 9, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 210 */,
0, 1, 2, 3, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 211 */,
4, 5, 8, 9, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 212 */,
0, 1, 4, 5, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 213 */,
2, 3, 4, 5, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 214 */,
0, 1, 2, 3, 4, 5, 8, 9,
12, 13, 14, 15, 255, 255, 255, 255 /* 215 */,
6, 7, 8, 9, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 216 */,
0, 1, 6, 7, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 217 */,
2, 3, 6, 7, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 218 */,
0, 1, 2, 3, 6, 7, 8, 9,
12, 13, 14, 15, 255, 255, 255, 255 /* 219 */,
4, 5, 6, 7, 8, 9, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 220 */,
0, 1, 4, 5, 6, 7, 8, 9,
12, 13, 14, 15, 255, 255, 255, 255 /* 221 */,
2, 3, 4, 5, 6, 7, 8, 9,
12, 13, 14, 15, 255, 255, 255, 255 /* 222 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 12, 13, 14, 15, 255, 255 /* 223 */,
10, 11, 12, 13, 14, 15, 255, 255,
255, 255, 255, 255, 255, 255, 255, 255 /* 224 */,
0, 1, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 225 */,
2, 3, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 226 */,
0, 1, 2, 3, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 227 */,
4, 5, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 228 */,
0, 1, 4, 5, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 229 */,
2, 3, 4, 5, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 230 */,
0, 1, 2, 3, 4, 5, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 231 */,
6, 7, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 232 */,
0, 1, 6, 7, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 233 */,
2, 3, 6, 7, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 234 */,
0, 1, 2, 3, 6, 7, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 235 */,
4, 5, 6, 7, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 236 */,
0, 1, 4, 5, 6, 7, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 237 */,
2, 3, 4, 5, 6, 7, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 238 */,
0, 1, 2, 3, 4, 5, 6, 7,
10, 11, 12, 13, 14, 15, 255, 255 /* 239 */,
8, 9, 10, 11, 12, 13, 14, 15,
255, 255, 255, 255, 255, 255, 255, 255 /* 240 */,
0, 1, 8, 9, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 241 */,
2, 3, 8, 9, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 242 */,
0, 1, 2, 3, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 243 */,
4, 5, 8, 9, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 244 */,
0, 1, 4, 5, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 245 */,
2, 3, 4, 5, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 246 */,
0, 1, 2, 3, 4, 5, 8, 9,
10, 11, 12, 13, 14, 15, 255, 255 /* 247 */,
6, 7, 8, 9, 10, 11, 12, 13,
14, 15, 255, 255, 255, 255, 255, 255 /* 248 */,
0, 1, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 249 */,
2, 3, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 250 */,
0, 1, 2, 3, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 255, 255 /* 251 */,
4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 255, 255, 255, 255 /* 252 */,
0, 1, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 255, 255 /* 253 */,
2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 255, 255 /* 254 */,
0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 12, 13, 14, 15 /* 255 */,
};
#else /* MLK_ARITH_BACKEND_X86_64_DEFAULT && !MLK_CONFIG_MULTILEVEL_NO_SHARED \
*/
MLK_EMPTY_CU(avx2_rej_uniform_table)
#endif /* !(MLK_ARITH_BACKEND_X86_64_DEFAULT && \
!MLK_CONFIG_MULTILEVEL_NO_SHARED) */

View File

@@ -0,0 +1,42 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* This file is derived from the public domain
* AVX2 Kyber implementation @[REF_AVX2].
*/
/* Regroup the 128-bit halves of r0 and r1: */
/* r2 = (low 128 bits of r0, low 128 bits of r1) */
/* r3 = (high 128 bits of r0, high 128 bits of r1) */
/* r2 is written before r0/r1 are read for the second time, so r2 must */
/* not name the same register as r0 or r1. */
.macro shuffle8 r0,r1,r2,r3
vperm2i128 $0x20,%ymm\r1,%ymm\r0,%ymm\r2
vperm2i128 $0x31,%ymm\r1,%ymm\r0,%ymm\r3
.endm
/* Interleave 64-bit quadwords of r0 and r1 within each 128-bit lane: */
/* r2 = (r0.q0, r1.q0, r0.q2, r1.q2)  -- the low quadword of every lane */
/* r3 = (r0.q1, r1.q1, r0.q3, r1.q3)  -- the high quadword of every lane */
/* r2 is written before r0/r1 are read for the second time, so r2 must */
/* not name the same register as r0 or r1. */
.macro shuffle4 r0,r1,r2,r3
vpunpcklqdq %ymm\r1,%ymm\r0,%ymm\r2
vpunpckhqdq %ymm\r1,%ymm\r0,%ymm\r3
.endm
/* Shuffle r0=(a0,b0,c0,d0,...), r1=(a1,b1,c1,d1,...) into */
/* r2 = (a0,b0,a1,b1,e0,f0,e1,f1,...) */
/* r3 = (c0,d0,c1,d1,g0,h0,g1,h1,...) */
/* Each letter is one 16-bit element; the macro moves them in 32-bit pairs. */
/* NOTE: r0 is clobbered (shifted in place), and r2 is written before */
/* r0/r1 are read again, so r2 must not name the same register as r0 or r1. */
.macro shuffle2 r0,r1,r2,r3
/* r2=(a1,b1,a1,b1,e1,f1,e1,f1,...) */
vmovsldup %ymm\r1,%ymm\r2
/* Conditional move */
/* 0xAA = 0b10101010 */
/* Odd 32-bit positions come from r2, even ones from r0: */
/* r2=(a0,b0,a1,b1,e0,f0,e1,f1,...) */
vpblendd $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
/* r0=(c0,d0,0,0,g0,h0,0,0,...) */
vpsrlq $32,%ymm\r0,%ymm\r0
/* Odd 32-bit positions come from r1, even ones from the shifted r0: */
/* r3=(c0,d0,c1,d1,g0,h0,g1,h1,...) */
vpblendd $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm
/* Shuffle r0=(a0,b0,c0,d0,...), r1=(a1,b1,c1,d1,...) of 16-bit elements into */
/* r2 = (a0,a1,c0,c1,...)  -- even-indexed elements of r0 and r1, interleaved */
/* r3 = (b0,b1,d0,d1,...)  -- odd-indexed elements of r0 and r1, interleaved */
/* NOTE: r0 is clobbered (shifted in place), and r2 is written before */
/* r0/r1 are read again, so r2 must not name the same register as r0 or r1. */
.macro shuffle1 r0,r1,r2,r3
/* Move the even-indexed elements of r1 into the odd 16-bit positions of r2 */
vpslld $16,%ymm\r1,%ymm\r2
/* 0xAA: odd 16-bit positions from r2 (shifted r1), even ones from r0 */
vpblendw $0xAA,%ymm\r2,%ymm\r0,%ymm\r2
/* Move the odd-indexed elements of r0 into the even 16-bit positions */
vpsrld $16,%ymm\r0,%ymm\r0
/* 0xAA: odd 16-bit positions from r1, even ones from the shifted r0 */
vpblendw $0xAA,%ymm\r1,%ymm\r0,%ymm\r3
.endm

View File

@@ -0,0 +1,100 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [REF_AVX2]
* CRYSTALS-Kyber optimized AVX2 implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/avx2
*/
/*
* Implementation from Kyber reference repository @[REF_AVX2]
*
* Changes:
* - Add call to csub in reduce128_avx to produce outputs
* in [0,1,...,q-1] rather than [0,1,...,q], matching the
* semantics of mlk_poly_reduce().
*/
#include "_internal_s2n_bignum.h"
/*
* WARNING: This file is auto-derived from the mlkem-native source file
* dev/x86_64/src/tomont.S using scripts/simpasm. Do not modify it directly.
*/
.text
.balign 4
/*
* mlkem_tomont_avx2
*
* Registers on entry:
* - rdi: base of a 512-byte buffer of 16-bit values that is rewritten in
* place (two passes of Ltomont_avx2_core over 0x100 bytes each).
* - rsi: base of a table of constants; 32-byte vectors are read from
* offsets 0x00, 0xa0 and 0xc0.
* All accesses use vmovdqa, so both pointers must be 32-byte aligned.
*
* Clobbers ymm0-ymm2 here and ymm3-ymm15 in the core routine; rdi is
* advanced by 0x100 and not restored.
*
* NOTE(review): rdi/rsi are presumably the first and second C arguments
* (coefficient array, constants table) -- confirm against the C prototype.
*/
S2N_BN_SYM_VISIBILITY_DIRECTIVE(mlkem_tomont_avx2)
S2N_BN_SYM_PRIVACY_DIRECTIVE(mlkem_tomont_avx2)
S2N_BN_SYMBOL(mlkem_tomont_avx2):
.cfi_startproc
/* Constants shared by both passes; see Ltomont_avx2_core for their use */
vmovdqa (%rsi), %ymm0
vmovdqa 0xa0(%rsi), %ymm1
vmovdqa 0xc0(%rsi), %ymm2
/* First 0x100 bytes of the buffer */
callq Ltomont_avx2_core
/* Second 0x100 bytes of the buffer */
addq $0x100, %rdi # imm = 0x100
callq Ltomont_avx2_core
retq
.cfi_endproc
/*
* Ltomont_avx2_core
*
* Transforms the 0x100 bytes at rdi (8 vectors of 16 x 16-bit lanes) in
* place. With constants c0 = ymm0, c1 = ymm1, c2 = ymm2 set up by the
* caller, every 16-bit lane x is replaced by
*
* mulhi(x, c2) - mulhi(mullo(x, c1), c0)
*
* where mullo/mulhi are the low/high 16 bits of the signed 16x16-bit
* product (vpmullw/vpmulhw).
*
* NOTE(review): this is the shape of a signed Montgomery multiplication by
* a fixed constant, presumably with c0 = q, c2 = the constant and
* c1 = constant * q^-1 mod 2^16 -- confirm against the constants table.
*
* Uses ymm3-ymm10 for data and ymm11-ymm15 as temporaries; ymm0-ymm2 and
* rdi are left unchanged.
*/
Ltomont_avx2_core:
.cfi_startproc
/* Load 8 x 32 bytes (aligned) */
vmovdqa (%rdi), %ymm3
vmovdqa 0x20(%rdi), %ymm4
vmovdqa 0x40(%rdi), %ymm5
vmovdqa 0x60(%rdi), %ymm6
vmovdqa 0x80(%rdi), %ymm7
vmovdqa 0xa0(%rdi), %ymm8
vmovdqa 0xc0(%rdi), %ymm9
vmovdqa 0xe0(%rdi), %ymm10
/* The same four-instruction pattern is applied to each of ymm3..ymm10: */
/* tmp = mullo(x, c1); x = mulhi(x, c2); tmp = mulhi(tmp, c0); x -= tmp */
vpmullw %ymm1, %ymm3, %ymm11
vpmulhw %ymm2, %ymm3, %ymm3
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm11, %ymm3, %ymm3
vpmullw %ymm1, %ymm4, %ymm12
vpmulhw %ymm2, %ymm4, %ymm4
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm4, %ymm4
vpmullw %ymm1, %ymm5, %ymm13
vpmulhw %ymm2, %ymm5, %ymm5
vpmulhw %ymm0, %ymm13, %ymm13
vpsubw %ymm13, %ymm5, %ymm5
vpmullw %ymm1, %ymm6, %ymm14
vpmulhw %ymm2, %ymm6, %ymm6
vpmulhw %ymm0, %ymm14, %ymm14
vpsubw %ymm14, %ymm6, %ymm6
vpmullw %ymm1, %ymm7, %ymm15
vpmulhw %ymm2, %ymm7, %ymm7
vpmulhw %ymm0, %ymm15, %ymm15
vpsubw %ymm15, %ymm7, %ymm7
/* Temporaries ymm11..ymm13 are reused from here on */
vpmullw %ymm1, %ymm8, %ymm11
vpmulhw %ymm2, %ymm8, %ymm8
vpmulhw %ymm0, %ymm11, %ymm11
vpsubw %ymm11, %ymm8, %ymm8
vpmullw %ymm1, %ymm9, %ymm12
vpmulhw %ymm2, %ymm9, %ymm9
vpmulhw %ymm0, %ymm12, %ymm12
vpsubw %ymm12, %ymm9, %ymm9
vpmullw %ymm1, %ymm10, %ymm13
vpmulhw %ymm2, %ymm10, %ymm10
vpmulhw %ymm0, %ymm13, %ymm13
vpsubw %ymm13, %ymm10, %ymm10
/* Store the 8 result vectors back over the inputs */
vmovdqa %ymm3, (%rdi)
vmovdqa %ymm4, 0x20(%rdi)
vmovdqa %ymm5, 0x40(%rdi)
vmovdqa %ymm6, 0x60(%rdi)
vmovdqa %ymm7, 0x80(%rdi)
vmovdqa %ymm8, 0xa0(%rdi)
vmovdqa %ymm9, 0xc0(%rdi)
vmovdqa %ymm10, 0xe0(%rdi)
retq
.cfi_endproc

View File

@@ -0,0 +1,26 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* WARNING: This file is auto-generated from scripts/autogen
* in the mlkem-native repository.
* Do not modify it directly.
*/
/*
* Table of twiddle values used in the AVX2 mulcache
* See autogen for details.
*/
- 1103,
555, -1251, 1550, 422, 177, -291, 1574, -246, 1159, -777, -602, -1590, -872, 418, -156, 430,
843, 871, 105, 587, -235, -460, 1653, 778, -147, 1483, 1119, 644, 349, 329, -75, 817, 603, 1322,
-1465, -1215, 1218, -874, -1187, -1185, -1278, -1510, -870, -108, 996, 958, 1522, 1097, 610,
-1285, 384, -136, -1335, 220, -1659, -1530, 794, -854, 478, -308, 991, -1460, 1628, -335,
-11477, -32227, 20494, -27738, 945, -14883, 6182, 32010, 10631, 29175, -28762, -18486, 17560,
-14430, -5276, 11182, 13387, -14233, -21655, 13131, -4587, 23092, 5493, -32502, 30317, -18741,
12639, 20100, 18525, 19529, -12619, -31183, 25435, -7382, 24391, -20927, 10946, 24214, 16989,
10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, 20297, 2146, 15355, -32384, -6280,
-14903, -11044, 14469, -21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132,

View File

@@ -0,0 +1,57 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* WARNING: This file is auto-generated from scripts/autogen
* in the mlkem-native repository.
* Do not modify it directly.
*/
/*
* Table of zeta values used in the AVX2 NTTs
* See autogen for details.
*/
31498, 31498, 31498, 31498, -758, -758, -758, -758, 0, 0, 0, 0, 0, 0, 0, 0,
14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745, 14745,
14745, 14745, 14745, 14745, 14745, -359, -359, -359, -359, -359, -359, -359,
-359, -359, -359, -359, -359, -359, -359, -359, -359, 13525, 13525, 13525,
13525, 13525, 13525, 13525, 13525, -12402, -12402, -12402, -12402, -12402,
-12402, -12402, -12402, 1493, 1493, 1493, 1493, 1493, 1493, 1493, 1493,
1422, 1422, 1422, 1422, 1422, 1422, 1422, 1422, -20907, -20907, -20907,
-20907, 27758, 27758, 27758, 27758, -3799, -3799, -3799, -3799, -15690,
-15690, -15690, -15690, -171, -171, -171, -171, 622, 622, 622, 622, 1577,
1577, 1577, 1577, 182, 182, 182, 182, -5827, -5827, 17363, 17363, -26360,
-26360, -29057, -29057, 5571, 5571, -1102, -1102, 21438, 21438, -26242,
-26242, 573, 573, -1325, -1325, 264, 264, 383, 383, -829, -829, 1458, 1458,
-1602, -1602, -130, -130, -5689, -6516, 1496, 30967, -23565, 20179, 20710,
25080, -12796, 26616, 16064, -12442, 9134, -650, -25986, 27837, 1223, 652,
-552, 1015, -1293, 1491, -282, -1544, 516, -8, -320, -666, -1618, -1162,
126, 1469, -335, -11477, -32227, 20494, -27738, 945, -14883, 6182, 32010,
10631, 29175, -28762, -18486, 17560, -14430, -5276, -1103, 555, -1251, 1550,
422, 177, -291, 1574, -246, 1159, -777, -602, -1590, -872, 418, -156, 11182,
13387, -14233, -21655, 13131, -4587, 23092, 5493, -32502, 30317, -18741,
12639, 20100, 18525, 19529, -12619, 430, 843, 871, 105, 587, -235, -460,
1653, 778, -147, 1483, 1119, 644, 349, 329, -75, 787, 787, 787, 787, 787,
787, 787, 787, 787, 787, 787, 787, 787, 787, 787, 787, -1517, -1517, -1517,
-1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517, -1517,
-1517, -1517, 28191, 28191, 28191, 28191, 28191, 28191, 28191, 28191,
-16694, -16694, -16694, -16694, -16694, -16694, -16694, -16694, 287, 287,
287, 287, 287, 287, 287, 287, 202, 202, 202, 202, 202, 202, 202, 202, 10690,
10690, 10690, 10690, 1358, 1358, 1358, 1358, -11202, -11202, -11202, -11202,
31164, 31164, 31164, 31164, 962, 962, 962, 962, -1202, -1202, -1202, -1202,
-1474, -1474, -1474, -1474, 1468, 1468, 1468, 1468, -28073, -28073, 24313,
24313, -10532, -10532, 8800, 8800, 18426, 18426, 8859, 8859, 26675, 26675,
-16163, -16163, -681, -681, 1017, 1017, 732, 732, 608, 608, -1542, -1542,
411, 411, -205, -205, -1571, -1571, 19883, -28250, -15887, -8898, -28309,
9075, -30199, 18249, 13426, 14017, -29156, -12757, 16832, 4311, -24155,
-17915, -853, -90, -271, 830, 107, -1421, -247, -951, -398, 961, -1508,
-725, 448, -1065, 677, -1275, -31183, 25435, -7382, 24391, -20927, 10946,
24214, 16989, 10335, -7934, -22502, 10906, 31636, 28644, 23998, -17422, 817,
603, 1322, -1465, -1215, 1218, -874, -1187, -1185, -1278, -1510, -870, -108,
996, 958, 1522, 20297, 2146, 15355, -32384, -6280, -14903, -11044, 14469,
-21498, -20198, 23210, -17442, -23860, -20257, 7756, 23132, 1097, 610,
-1285, 384, -136, -1335, 220, -1659, -1530, 794, -854, 478, -308, 991,
-1460, 1628,

View File

@@ -0,0 +1,82 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_PARAMS_H
#define MLK_PARAMS_H
#if defined(MLK_CONFIG_FILE)
#include MLK_CONFIG_FILE
#else
#include "config.h"
#endif
/*
* The configuration must define MLK_CONFIG_PARAMETER_SET. It selects
* MLKEM_K as follows: 512 -> 2, 768 -> 3, 1024 -> 4.
* A missing or unsupported value is rejected at compile time.
*/
#if !defined(MLK_CONFIG_PARAMETER_SET)
#error MLK_CONFIG_PARAMETER_SET is not defined
#endif
#if MLK_CONFIG_PARAMETER_SET == 512
#define MLKEM_K 2
#elif MLK_CONFIG_PARAMETER_SET == 768
#define MLKEM_K 3
#elif MLK_CONFIG_PARAMETER_SET == 1024
#define MLKEM_K 4
#else
#error Invalid value for MLK_CONFIG_PARAMETER_SET. Must be 512, 768, or 1024.
#endif
/* Constants shared by all parameter sets */
/* Number of coefficients per polynomial */
#define MLKEM_N 256
/* Modulus of the coefficient arithmetic */
#define MLKEM_Q 3329
#define MLKEM_Q_HALF ((MLKEM_Q + 1) / 2) /* 1665 */
/* 2^12 */
#define MLKEM_UINT12_LIMIT 4096
#define MLKEM_SYMBYTES 32 /* size in bytes of hashes, and seeds */
#define MLKEM_SSBYTES 32 /* size in bytes of shared key */
/* MLKEM_N * 12 / 8 */
#define MLKEM_POLYBYTES 384
#define MLKEM_POLYVECBYTES (MLKEM_K * MLKEM_POLYBYTES)
/* MLKEM_POLYCOMPRESSEDBYTES_D<d> equals MLKEM_N * d / 8 */
#define MLKEM_POLYCOMPRESSEDBYTES_D4 128
#define MLKEM_POLYCOMPRESSEDBYTES_D5 160
#define MLKEM_POLYCOMPRESSEDBYTES_D10 320
#define MLKEM_POLYCOMPRESSEDBYTES_D11 352
/*
* Parameters that depend on MLKEM_K. Every branch defines MLKEM_ETA1,
* MLKEM_DU and MLKEM_DV, and picks the compressed polynomial sizes that
* match MLKEM_DU / MLKEM_DV (DU=10 -> D10, DU=11 -> D11, DV=4 -> D4,
* DV=5 -> D5).
* NOTE(review): ETA1/DU/DV presumably correspond to eta_1, d_u and d_v
* of FIPS 203 -- confirm against the standard's parameter table.
*/
#if MLKEM_K == 2
#define MLKEM_ETA1 3
#define MLKEM_DU 10
#define MLKEM_DV 4
#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
#elif MLKEM_K == 3
#define MLKEM_ETA1 2
#define MLKEM_DU 10
#define MLKEM_DV 4
#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D4
#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D10
#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
#elif MLKEM_K == 4
#define MLKEM_ETA1 2
#define MLKEM_DU 11
#define MLKEM_DV 5
#define MLKEM_POLYCOMPRESSEDBYTES_DV MLKEM_POLYCOMPRESSEDBYTES_D5
#define MLKEM_POLYCOMPRESSEDBYTES_DU MLKEM_POLYCOMPRESSEDBYTES_D11
#define MLKEM_POLYVECCOMPRESSEDBYTES_DU (MLKEM_K * MLKEM_POLYCOMPRESSEDBYTES_DU)
#endif /* MLKEM_K == 4 */
/* Same value for all parameter sets */
#define MLKEM_ETA2 2
/* Sizes of the IND-CPA objects, derived from the constants above */
#define MLKEM_INDCPA_MSGBYTES (MLKEM_SYMBYTES)
#define MLKEM_INDCPA_PUBLICKEYBYTES (MLKEM_POLYVECBYTES + MLKEM_SYMBYTES)
#define MLKEM_INDCPA_SECRETKEYBYTES (MLKEM_POLYVECBYTES)
#define MLKEM_INDCPA_BYTES \
(MLKEM_POLYVECCOMPRESSEDBYTES_DU + MLKEM_POLYCOMPRESSEDBYTES_DV)
/* Sizes of the IND-CCA objects */
#define MLKEM_INDCCA_PUBLICKEYBYTES (MLKEM_INDCPA_PUBLICKEYBYTES)
/*
* The secret key holds the IND-CPA secret key, a copy of the IND-CPA public
* key, and 2 * MLKEM_SYMBYTES of additional data: 32 bytes to save H(pk),
* plus another 32 bytes.
* NOTE(review): the second 32 bytes are presumably the implicit-rejection
* value z of FIPS 203 -- confirm against the key generation code.
*/
#define MLKEM_INDCCA_SECRETKEYBYTES \
(MLKEM_INDCPA_SECRETKEYBYTES + MLKEM_INDCPA_PUBLICKEYBYTES + \
2 * MLKEM_SYMBYTES)
#define MLKEM_INDCCA_CIPHERTEXTBYTES (MLKEM_INDCPA_BYTES)
#endif /* !MLK_PARAMS_H */

View File

@@ -0,0 +1,508 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [NeonNTT]
* Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
* Becker, Hwang, Kannwischer, Yang, Yang
* https://eprint.iacr.org/2021/986
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <stdint.h>
#include <string.h>
#include "cbmc.h"
#include "debug.h"
#include "poly.h"
#include "sampling.h"
#include "symmetric.h"
#include "verify.h"
/*************************************************
 * Name:        mlk_fqmul
 *
 * Description: Multiplies two 16-bit values and maps the 32-bit product
 *              back to 16 bits by Montgomery reduction modulo MLKEM_Q.
 *
 * Arguments:   - int16_t a: first factor; any int16_t is allowed.
 *              - int16_t b: second factor; must be signed canonical,
 *                that is, |b| < (MLKEM_Q+1)/2.
 *
 * Returns a 16-bit integer congruent to a*b*R^{-1} mod MLKEM_Q whose
 * absolute value is smaller than MLKEM_Q.
 *
 **************************************************/
/* Reference: `fqmul()` in the reference implementation @[REF]. */
static MLK_INLINE int16_t mlk_fqmul(int16_t a, int16_t b)
__contract__(
  requires(b > -MLKEM_Q_HALF && b < MLKEM_Q_HALF)
  ensures(return_value > -MLKEM_Q && return_value < MLKEM_Q)
)
{
  int32_t product;
  int16_t reduced;

  mlk_assert_abs_bound(&b, 1, MLKEM_Q_HALF);

  /* Widen both factors first so that the product is formed in 32 bits. */
  product = (int32_t)a * (int32_t)b;
  reduced = mlk_montgomery_reduce(product);

  /* Why |reduced| < MLKEM_Q:
   *   |reduced| <= ceil(|a| * |b| / 2^16) + (MLKEM_Q + 1) / 2
   *             <= ceil(2^15 * ((MLKEM_Q - 1)/2) / 2^16) + (MLKEM_Q + 1) / 2
   *             <= ceil((MLKEM_Q - 1) / 4) + (MLKEM_Q + 1) / 2
   *              < MLKEM_Q
   */
  mlk_assert_abs_bound(&reduced, 1, MLKEM_Q);
  return reduced;
}
/*************************************************
 * Name:        mlk_barrett_reduce
 *
 * Description: Barrett reduction of a 16-bit integer modulo q = MLKEM_Q.
 *              The result is the centered representative of the input,
 *              i.e. the value in {-(q-1)/2,...,(q-1)/2} congruent to it.
 *
 * Arguments:   - int16_t a: integer to be reduced; any int16_t is allowed
 *
 * Returns:     integer in {-(q-1)/2,...,(q-1)/2} congruent to a modulo q.
 *
 **************************************************/
/* Reference: `barrett_reduce()` in the reference implementation @[REF]. */
static MLK_INLINE int16_t mlk_barrett_reduce(int16_t a)
__contract__(
  ensures(return_value > -MLKEM_Q_HALF && return_value < MLKEM_Q_HALF)
)
{
  /* The quotient round(a / MLKEM_Q) is approximated in fixed point:
   * ```
   *    round(a / MLKEM_Q)
   *  = round(a * (2^N / MLKEM_Q) / 2^N)
   * ~= round(a * round(2^N / MLKEM_Q) / 2^N)
   * ```
   * Here, N=26.
   */
  const int32_t magic = 20159; /* check-magic: 20159 == round(2^26 / MLKEM_Q) */
  /* Adding 2^25 before shifting by 26 rounds instead of truncating. */
  const int32_t biased = magic * a + (1 << 25);
  /*
   * PORTABILITY: Right-shift on a signed integer is
   * implementation-defined for negative left argument.
   * Here, we assume it's sign-preserving "arithmetic" shift right.
   * See (C99 6.5.7 (5))
   */
  const int32_t quotient = biased >> 26;
  /*
   * quotient is in -10 .. +10, so quotient * MLKEM_Q and the
   * subtraction have to be evaluated in 32-bit arithmetic.
   */
  const int32_t centered = a - quotient * MLKEM_Q;
  int16_t res = (int16_t)centered;

  mlk_assert_abs_bound(&res, 1, MLKEM_Q_HALF);
  return res;
}
/*
* mlk_poly_tomont: in-place multiplication of every coefficient of r by
* the constant f via mlk_fqmul(). With f congruent to 2^32 modulo MLKEM_Q
* and mlk_fqmul() returning a*b*R^{-1}, each coefficient ends up multiplied
* by 2^32 * R^{-1}.
* NOTE(review): presumably R = 2^16, making this the conversion to
* Montgomery form -- confirm against mlk_montgomery_reduce().
*
* Arguments: - mlk_poly *r: polynomial, updated in place
*
* On return, all coefficients are smaller than MLKEM_Q in absolute value.
*/
/* Reference: `poly_tomont()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
{
unsigned i;
const int16_t f = 1353; /* check-magic: 1353 == signed_mod(2^32, MLKEM_Q) */
#if defined(MLK_USE_NATIVE_POLY_TOMONT)
/* Try the native backend first; if it does not report success, */
/* continue with the C implementation below. */
int ret;
ret = mlk_poly_tomont_native(r->coeffs);
if (ret == MLK_NATIVE_FUNC_SUCCESS)
{
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
return;
}
#endif /* MLK_USE_NATIVE_POLY_TOMONT */
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
invariant(array_abs_bound(r->coeffs, 0, i, MLKEM_Q)))
{
/* f is signed canonical, as required for mlk_fqmul()'s second argument */
r->coeffs[i] = mlk_fqmul(r->coeffs[i], f);
}
mlk_assert_abs_bound(r, MLKEM_N, MLKEM_Q);
}
/************************************************************
 * Name: mlk_scalar_signed_to_unsigned_q
 *
 * Description: Maps a signed representative modulo MLKEM_Q from the range
 *              (-(MLKEM_Q-1) .. (MLKEM_Q-1)) to the unsigned representative
 *              in the range (0..(MLKEM_Q-1)), in constant time.
 *
 * Arguments: c: signed coefficient to be converted
 *
 ************************************************************/
/* Reference: Not present in the reference implementation @[REF].
 *            - Needed for the different semantics of `poly_reduce()` in
 *              this implementation. In the reference implementation
 *              @[REF], the same logic is part of every compression
 *              function (see `compress.c`). */
static MLK_INLINE uint16_t mlk_scalar_signed_to_unsigned_q(int16_t c)
__contract__(
  requires(c > -MLKEM_Q && c < MLKEM_Q)
  ensures(return_value >= 0 && return_value < MLKEM_Q)
  ensures(return_value == (int32_t)c + (((int32_t)c < 0) * MLKEM_Q)))
{
  int16_t nonneg;

  mlk_assert_abs_bound(&c, 1, MLKEM_Q);

  /* Constant-time selection: c + MLKEM_Q for negative c, c otherwise */
  nonneg = mlk_ct_sel_int16(c + MLKEM_Q, c, mlk_ct_cmask_neg_i16(c));

  /* The value is now non-negative, which makes the cast below safe. */
  mlk_assert_bound(&nonneg, 1, 0, MLKEM_Q);
  return (uint16_t)nonneg;
}
/*
* mlk_poly_reduce: in-place reduction of every coefficient of r to its
* unsigned canonical representative in 0 .. MLKEM_Q-1.
*
* Arguments: - mlk_poly *r: polynomial, updated in place
*/
/* Reference: `poly_reduce()` in the reference implementation @[REF]
* - We use _unsigned_ canonical outputs, while the reference
* implementation uses _signed_ canonical outputs.
* Accordingly, we need a conditional addition of MLKEM_Q
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
{
unsigned i;
#if defined(MLK_USE_NATIVE_POLY_REDUCE)
/* Try the native backend first; if it does not report success, */
/* continue with the C implementation below. */
int ret;
ret = mlk_poly_reduce_native(r->coeffs);
if (ret == MLK_NATIVE_FUNC_SUCCESS)
{
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
return;
}
#endif /* MLK_USE_NATIVE_POLY_REDUCE */
for (i = 0; i < MLKEM_N; i++)
__loop__(
invariant(i <= MLKEM_N)
invariant(array_bound(r->coeffs, 0, i, 0, MLKEM_Q)))
{
/* Barrett reduction, giving signed canonical representative */
int16_t t = mlk_barrett_reduce(r->coeffs[i]);
/* Conditional addition to get unsigned canonical representative */
r->coeffs[i] = mlk_scalar_signed_to_unsigned_q(t);
}
mlk_assert_bound(r, MLKEM_N, 0, MLKEM_Q);
}
/* Coefficient-wise addition r := r + b.
 *
 * Arguments: - mlk_poly *r: first summand, overwritten with the sum
 *            - const mlk_poly *b: second summand, only read
 *
 * Reference: `poly_add()` in the reference implementation @[REF].
 *            - The destructive form (output=first input) is used so
 *              that the CBMC specification does not have to reason
 *              about aliasing */
MLK_INTERNAL_API
void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
{
  unsigned idx;
  for (idx = 0; idx < MLKEM_N; idx++)
  __loop__(
    invariant(idx <= MLKEM_N)
    invariant(forall(k0, idx, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
    invariant(forall(k1, 0, idx, r->coeffs[k1] == loop_entry(*r).coeffs[k1] + b->coeffs[k1])))
  {
    r->coeffs[idx] += b->coeffs[idx];
  }
}
/* Coefficient-wise subtraction r := r - b.
 *
 * Arguments: - mlk_poly *r: minuend, overwritten with the difference
 *            - const mlk_poly *b: subtrahend, only read
 *
 * Reference: `poly_sub()` in the reference implementation @[REF].
 *            - The destructive form (output=first input) is used so
 *              that the CBMC specification does not have to reason
 *              about aliasing */
MLK_INTERNAL_API
void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
{
  unsigned idx;
  for (idx = 0; idx < MLKEM_N; idx++)
  __loop__(
    invariant(idx <= MLKEM_N)
    invariant(forall(k0, idx, MLKEM_N, r->coeffs[k0] == loop_entry(*r).coeffs[k0]))
    invariant(forall(k1, 0, idx, r->coeffs[k1] == loop_entry(*r).coeffs[k1] - b->coeffs[k1])))
  {
    r->coeffs[idx] -= b->coeffs[idx];
  }
}
#include "zetas.inc"
/*
* mlk_poly_mulcache_compute: fills the multiplication cache x from the
* polynomial a. For every group of four coefficients (index i), the two
* odd-indexed coefficients a[4i+1] and a[4i+3] are multiplied by
* mlk_zetas[64 + i] and -mlk_zetas[64 + i], respectively, using
* mlk_fqmul(). The cache thus holds MLKEM_N / 2 values.
*
* Arguments: - mlk_poly_mulcache *x: output cache
* - const mlk_poly *a: input polynomial, only read
*/
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
{
unsigned i;
#if defined(MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE)
/* Try the native backend first; if it does not report success, */
/* continue with the C implementation below. */
int ret;
ret = mlk_poly_mulcache_compute_native(x->coeffs, a->coeffs);
if (ret == MLK_NATIVE_FUNC_SUCCESS)
{
return;
}
#endif /* MLK_USE_NATIVE_POLY_MULCACHE_COMPUTE */
for (i = 0; i < MLKEM_N / 4; i++)
__loop__(
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(x->coeffs, 0, 2 * i, MLKEM_Q)))
{
/* The same twiddle is used with both signs for the two cache entries */
x->coeffs[2 * i + 0] = mlk_fqmul(a->coeffs[4 * i + 1], mlk_zetas[64 + i]);
x->coeffs[2 * i + 1] = mlk_fqmul(a->coeffs[4 * i + 3], -mlk_zetas[64 + i]);
}
/*
* This bound is true for the C implementation, but not needed
* in the higher level bounds reasoning. It is thus omitted
* from the spec to not unnecessarily constrain native
* implementations, but checked here nonetheless.
*/
mlk_assert_abs_bound(x, MLKEM_N / 2, MLKEM_Q);
}
/*
* Computes a block of CT butterflies with a fixed twiddle factor,
* using Montgomery multiplication.
* Parameters:
* - r: Pointer to base of polynomial (_not_ the base of butterfly block)
* - zeta: Twiddle factor to use for the butterfly. This must be in
* Montgomery form and signed canonical.
* - start: Offset to the beginning of the butterfly block
* - len: Index difference between coefficients subject to a butterfly
* - bound: Ghost variable describing coefficient bound: Prior to `start`,
* coefficients must be bound by `bound + MLKEM_Q`. Post `start`,
* they must be bound by `bound`.
* When this function returns, output coefficients in the index range
* [start, start+2*len) have bound bumped to `bound + MLKEM_Q`.
* Example:
* - start=8, len=4
* This would compute the following four butterflies
* 8 -- 12
* 9 -- 13
* 10 -- 14
* 11 -- 15
* - start=4, len=2
* This would compute the following two butterflies
* 4 -- 6
* 5 -- 7
*/
/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
static void mlk_ntt_butterfly_block(int16_t r[MLKEM_N], int16_t zeta,
unsigned start, unsigned len, int bound)
__contract__(
requires(start < MLKEM_N)
requires(1 <= len && len <= MLKEM_N / 2 && start + 2 * len <= MLKEM_N)
requires(0 <= bound && bound < INT16_MAX - MLKEM_Q)
requires(-MLKEM_Q_HALF < zeta && zeta < MLKEM_Q_HALF)
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(array_abs_bound(r, 0, start, bound + MLKEM_Q))
requires(array_abs_bound(r, start, MLKEM_N, bound))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(array_abs_bound(r, 0, start + 2*len, bound + MLKEM_Q))
ensures(array_abs_bound(r, start + 2 * len, MLKEM_N, bound)))
{
/* `bound` is a ghost variable only needed in the CBMC specification */
unsigned j;
((void)bound);
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
/*
* Coefficients are updated in strided pairs, so the bounds for the
* intermediate states alternate twice between the old and new bound
*/
invariant(array_abs_bound(r, 0, j, bound + MLKEM_Q))
invariant(array_abs_bound(r, j, start + len, bound))
invariant(array_abs_bound(r, start + len, j + len, bound + MLKEM_Q))
invariant(array_abs_bound(r, j + len, MLKEM_N, bound)))
{
int16_t t;
/* t is bounded by MLKEM_Q in absolute value (see mlk_fqmul), which is */
/* why the bound of both outputs grows by exactly MLKEM_Q. */
t = mlk_fqmul(r[j + len], zeta);
/* r[j + len] has to be written first: it needs the old value of r[j] */
r[j + len] = r[j] - t;
r[j] = r[j] + t;
}
}
/*
* Compute one layer of forward NTT
* Parameters:
* - r: Pointer to base of polynomial
* - layer: Variable indicating which layer is being applied.
* Must be in 1..7.
* On entry, coefficients must be bounded by layer * MLKEM_Q in absolute
* value; on return they are bounded by (layer + 1) * MLKEM_Q.
*/
/* Reference: Embedded in `ntt()` in the reference implementation @[REF]. */
static void mlk_ntt_layer(int16_t r[MLKEM_N], unsigned layer)
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(1 <= layer && layer <= 7)
requires(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(array_abs_bound(r, 0, MLKEM_N, (layer + 1) * MLKEM_Q)))
{
unsigned start, k, len;
/* Twiddle factors for layer n are at indices 2^(n-1)..2^n-1. */
k = 1u << (layer - 1);
/* Butterfly distance: MLKEM_N / 2 for layer 1, halving with each layer */
len = MLKEM_N >> layer;
/* The polynomial is processed in consecutive blocks of 2 * len */
/* coefficients, each with its own twiddle factor. */
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(start < MLKEM_N + 2 * len)
invariant(k <= MLKEM_N / 2 && 2 * len * k == start + MLKEM_N)
invariant(array_abs_bound(r, 0, start, layer * MLKEM_Q + MLKEM_Q))
invariant(array_abs_bound(r, start, MLKEM_N, layer * MLKEM_Q)))
{
int16_t zeta = mlk_zetas[k++];
mlk_ntt_butterfly_block(r, zeta, start, len, layer * MLKEM_Q);
}
}
/*
* Compute full forward NTT
* NOTE: This particular implementation satisfies a much tighter
* bound on the output coefficients (5*q) than the contractual one (8*q),
* but this is not needed in the calling code. Should we change the
* base multiplication strategy to require smaller NTT output bounds,
* the proof may need strengthening.
*/
/* Reference: `ntt()` in the reference implementation @[REF].
* - Iterate over `layer` instead of `len` in the outer loop
* to simplify computation of zeta index. */
/* In-place forward NTT of `p`: tries the native backend first (if compiled
* in) and otherwise applies the seven C NTT layers in order. */
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *p)
{
unsigned layer;
int16_t *r;
/* Input coefficients are expected to be below MLKEM_Q in absolute value. */
mlk_assert_abs_bound(p, MLKEM_N, MLKEM_Q);
#if defined(MLK_USE_NATIVE_NTT)
{
int ret;
ret = mlk_ntt_native(p->coeffs);
/* Any other return value falls through to the C implementation below. */
if (ret == MLK_NATIVE_FUNC_SUCCESS)
{
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
return;
}
}
#endif /* MLK_USE_NATIVE_NTT */
r = p->coeffs;
/* Each layer raises the coefficient bound by MLKEM_Q (see the contract
* of mlk_ntt_layer()), which is what the invariant tracks. */
for (layer = 1; layer <= 7; layer++)
__loop__(
invariant(1 <= layer && layer <= 8)
invariant(array_abs_bound(r, 0, MLKEM_N, layer * MLKEM_Q)))
{
mlk_ntt_layer(r, layer);
}
/* Check the stronger bound */
mlk_assert_abs_bound(p, MLKEM_N, MLK_NTT_BOUND);
}
/* Compute one layer of inverse NTT */
/*
* Parameters:
* - r: Pointer to base of polynomial (MLKEM_N coefficients)
* - layer: Layer to apply, in the range 1..7 (see contract)
*
* Per the contract, coefficients are below MLKEM_Q in absolute value both
* on entry and on return.
*/
/* Reference: Embedded into `invntt()` in the reference implementation @[REF] */
static void mlk_invntt_layer(int16_t *r, unsigned layer)
__contract__(
requires(memory_no_alias(r, sizeof(int16_t) * MLKEM_N))
requires(1 <= layer && layer <= 7)
requires(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(r, sizeof(int16_t) * MLKEM_N))
ensures(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
unsigned start, k, len;
/* Distance between the two coefficients of a butterfly in this layer. */
len = (MLKEM_N >> layer);
/* Twiddle factors are consumed downwards, starting at index 2^layer - 1. */
k = (1u << layer) - 1;
for (start = 0; start < MLKEM_N; start += 2 * len)
__loop__(
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q))
invariant(start <= MLKEM_N && k <= 127)
/* Normalised form of k == MLKEM_N / len - 1 - start / (2 * len) */
invariant(2 * len * k + start == 2 * MLKEM_N - 2 * len))
{
unsigned j;
int16_t zeta = mlk_zetas[k--];
for (j = start; j < start + len; j++)
__loop__(
invariant(start <= j && j <= start + len)
invariant(start <= MLKEM_N && k <= 127)
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
/* Keep the old lower coefficient: it is needed for both outputs. */
int16_t t = r[j];
/* Sum goes to the lower half, reduced via mlk_barrett_reduce(). */
r[j] = mlk_barrett_reduce(t + r[j + len]);
/* Difference goes to the upper half and is then multiplied by zeta.
* NOTE(review): the MLKEM_Q invariant presumably follows from the
* output bounds of mlk_barrett_reduce() and mlk_fqmul(), which are
* defined elsewhere -- confirm against their contracts. */
r[j + len] = r[j + len] - t;
r[j + len] = mlk_fqmul(r[j + len], zeta);
}
}
}
/* Reference: `invntt()` in the reference implementation @[REF]
* - We normalize at the beginning of the inverse NTT,
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
/* In-place inverse NTT of `p`: tries the native backend first (if compiled
* in); otherwise scales all coefficients by `f` and then applies the
* inverse NTT layers 7 down to 1. */
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *p)
{
unsigned j, layer;
const int16_t f = 1441; /* check-magic: 1441 == pow(2,32 - 7,MLKEM_Q) */
int16_t *r = p->coeffs;
#if defined(MLK_USE_NATIVE_INTT)
int ret;
ret = mlk_intt_native(p->coeffs);
/* Any other return value falls through to the C implementation below. */
if (ret == MLK_NATIVE_FUNC_SUCCESS)
{
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
return;
}
#endif /* MLK_USE_NATIVE_INTT */
/*
* Scale input polynomial to account for Montgomery factor
* and NTT twist. This also brings coefficients down to
* absolute value < MLKEM_Q.
*/
for (j = 0; j < MLKEM_N; j++)
__loop__(
invariant(j <= MLKEM_N)
invariant(array_abs_bound(r, 0, j, MLKEM_Q)))
{
r[j] = mlk_fqmul(r[j], f);
}
/* Run the invNTT layers */
/* Every layer preserves the MLKEM_Q bound (contract of
* mlk_invntt_layer()), so the invariant carries it through the loop. */
for (layer = 7; layer > 0; layer--)
__loop__(
invariant(0 <= layer && layer < 8)
invariant(array_abs_bound(r, 0, MLKEM_N, MLKEM_Q)))
{
mlk_invntt_layer(r, layer);
}
mlk_assert_abs_bound(p, MLKEM_N, MLK_INVNTT_BOUND);
}
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(mlk_poly)
#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */

View File

@@ -0,0 +1,346 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*/
#ifndef MLK_POLY_H
#define MLK_POLY_H
#include <stddef.h>
#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include "debug.h"
#include "verify.h"
/* Absolute exclusive upper bound for the output of the inverse NTT */
#define MLK_INVNTT_BOUND (8 * MLKEM_Q)
/* Absolute exclusive upper bound for the output of the forward NTT */
#define MLK_NTT_BOUND (8 * MLKEM_Q)
/*
* Elements of R_q = Z_q[X]/(X^n + 1). Represents polynomial
* coeffs[0] + X*coeffs[1] + X^2*coeffs[2] + ... + X^{n-1}*coeffs[n-1]
*/
typedef struct
{
int16_t coeffs[MLKEM_N];
} MLK_ALIGN mlk_poly;
/*
* INTERNAL representation of precomputed data speeding up
* the base multiplication of two polynomials in NTT domain.
*/
typedef struct
{
int16_t coeffs[MLKEM_N >> 1];
} MLK_ALIGN mlk_poly_mulcache;
/*************************************************
* Name: mlk_cast_uint16_to_int16
*
* Description: Cast uint16 value to int16
*
* Returns:
* input x in 0 .. 32767: returns value unchanged
* input x in 32768 .. 65535: returns (x - 65536)
**************************************************/
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "conversion"
#endif
static MLK_ALWAYS_INLINE int16_t mlk_cast_uint16_to_int16(uint16_t x)
{
/*
* PORTABILITY: This relies on uint16_t -> int16_t
* being implemented as the inverse of int16_t -> uint16_t,
* which is implementation-defined (C99 6.3.1.3 (3))
* CBMC (correctly) fails to prove this conversion is OK,
* so we have to suppress that check here
*/
/* The cast is kept in this dedicated helper so that the CBMC "conversion"
* check (disabled by the pragmas surrounding this function) stays active
* for all other code. */
return (int16_t)x;
}
#ifdef CBMC
#pragma CPROVER check pop
#endif
/*************************************************
* Name: mlk_montgomery_reduce
*
* Description: Generic Montgomery reduction; given a 32-bit integer a, computes
* 16-bit integer congruent to a * R^-1 mod q, where R=2^16
*
* Arguments: - int32_t a: input integer to be reduced, of absolute value
* smaller or equal to INT32_MAX - 2^15 * MLKEM_Q.
*
* Returns: integer congruent to a * R^-1 modulo q, with absolute value
* <= ceil(|a| / 2^16) + (MLKEM_Q + 1)/2
*
**************************************************/
static MLK_ALWAYS_INLINE int16_t mlk_montgomery_reduce(int32_t a)
__contract__(
/* Keeps `a - t * MLKEM_Q` below from overflowing int32_t for |t| <= 2^15. */
requires(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)))
/* We don't attempt to express an input-dependent output bound
* as the post-condition here. There are two call-sites for this
* function:
* - The base multiplication: Here, we need no output bound.
* - mlk_fqmul: Here, we inline this function and prove another spec
* for mlk_fqmul which does have a post-condition bound. */
)
{
/* check-magic: 62209 == unsigned_mod(pow(MLKEM_Q, -1, 2^16), 2^16) */
const uint32_t QINV = 62209;
/* Compute a*q^{-1} mod 2^16 in unsigned representatives */
const uint16_t a_reduced = a & UINT16_MAX;
const uint16_t a_inverted = (a_reduced * QINV) & UINT16_MAX;
/* Lift to signed canonical representative mod 2^16. */
const int16_t t = mlk_cast_uint16_to_int16(a_inverted);
int32_t r;
/* Re-states the contract's precondition as an assertion. */
mlk_assert(a < +(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)) &&
a > -(INT32_MAX - (((int32_t)1 << 15) * MLKEM_Q)));
/* By construction t * MLKEM_Q is congruent to a modulo 2^16, so the low
* 16 bits of r are zero and the shift below is an exact division. */
r = a - ((int32_t)t * MLKEM_Q);
/*
* PORTABILITY: Right-shift on a signed integer is, strictly-speaking,
* implementation-defined for negative left argument. Here,
* we assume it's sign-preserving "arithmetic" shift right. (C99 6.5.7 (5))
*/
r = r >> 16;
/* Bounds: |r >> 16| <= ceil(|r| / 2^16)
* <= ceil(|a| / 2^16 + MLKEM_Q / 2)
* <= ceil(|a| / 2^16) + (MLKEM_Q + 1) / 2
*
* (Note that |a >> n| = ceil(|a| / 2^n) for negative a)
*/
return (int16_t)r;
}
#define mlk_poly_tomont MLK_NAMESPACE(poly_tomont)
/*************************************************
* Name: mlk_poly_tomont
*
* Description: Inplace conversion of all coefficients of a polynomial
* from normal domain to Montgomery domain
*
* Bounds: Output < q in absolute value.
*
* Arguments: - mlk_poly *r: pointer to input/output polynomial
*
* Specification: Internal normalization required in `mlk_indcpa_keypair_derand`
* as part of matrix-vector multiplication
* @[FIPS203, Algorithm 13, K-PKE.KeyGen, L18].
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_tomont(mlk_poly *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
);
#define mlk_poly_mulcache_compute MLK_NAMESPACE(poly_mulcache_compute)
/************************************************************
* Name: mlk_poly_mulcache_compute
*
* Description: Computes the mulcache for a polynomial in NTT domain
*
* The mulcache of a degree-2 polynomial b := b0 + b1*X
* in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
* computing products of b in Fq[X]/(X^2-zeta).
*
* The mulcache of a polynomial in NTT domain -- which is
* a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
* for varying zeta, is the 128-tuple of mulcaches of those
* polynomials.
*
* Arguments: - x: Pointer to mulcache to be populated
* - a: Pointer to input polynomial
*
* Specification:
* - Caches `b_1 * \gamma` in @[FIPS203, Algorithm 12, BaseCaseMultiply, L1]
*
************************************************************/
/*
* NOTE: The default C implementation of this function populates
* the mulcache with values in (-q,q), but this is not needed for the
* higher level safety proofs, and thus not part of the spec.
*/
MLK_INTERNAL_API
void mlk_poly_mulcache_compute(mlk_poly_mulcache *x, const mlk_poly *a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_poly_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_poly)))
assigns(object_whole(x))
);
#define mlk_poly_reduce MLK_NAMESPACE(poly_reduce)
/*************************************************
* Name: mlk_poly_reduce
*
* Description: Converts polynomial to _unsigned canonical_ representatives.
*
* The input coefficients can be arbitrary integers in int16_t.
* The output coefficients are in [0,1,...,MLKEM_Q-1].
*
* Arguments: - mlk_poly *r: pointer to input/output polynomial
*
* Specification: Normalizes to unsigned canonical representatives
* ahead of calling @[FIPS203, Compress_d, Eq (4.7)].
* This is not made explicit in FIPS 203.
*
**************************************************/
/*
* NOTE: The semantics of mlk_poly_reduce() is different in
* the reference implementation, which requires
* signed canonical output data. Unsigned canonical
* outputs are better suited to the only remaining
* use of mlk_poly_reduce() in the context of (de)serialization.
*/
MLK_INTERNAL_API
void mlk_poly_reduce(mlk_poly *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
);
#define mlk_poly_add MLK_NAMESPACE(poly_add)
/************************************************************
* Name: mlk_poly_add
*
* Description: Adds two polynomials in place
*
* Arguments: - r: Pointer to input-output polynomial to be added to.
* - b: Pointer to input polynomial that should be added
* to r. Must be disjoint from r.
*
* The coefficients of r and b must be so that the addition does
* not overflow. Otherwise, the behaviour of this function is undefined.
*
* Specification:
* - @[FIPS203, 2.4.5, Arithmetic With Polynomials and NTT Representations]
* - Used in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L21]
*
************************************************************/
/*
* NOTE: The reference implementation uses a 3-argument mlk_poly_add.
* We specialize to the accumulator form to avoid reasoning about aliasing.
*/
MLK_INTERNAL_API
void mlk_poly_add(mlk_poly *r, const mlk_poly *b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(b, sizeof(mlk_poly)))
requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] + b->coeffs[k0] <= INT16_MAX))
requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] + b->coeffs[k1] >= INT16_MIN))
ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] + b->coeffs[k]))
assigns(memory_slice(r, sizeof(mlk_poly)))
);
#define mlk_poly_sub MLK_NAMESPACE(poly_sub)
/*************************************************
* Name: mlk_poly_sub
*
* Description: Subtract two polynomials; no modular reduction is performed
*
* Arguments: - mlk_poly *r: Pointer to input-output polynomial to be subtracted from.
* - const mlk_poly *b: Pointer to second input polynomial
*
* Specification:
* - @[FIPS203, 2.4.5, Arithmetic With Polynomials and NTT Representations]
* - Used in @[FIPS203, Algorithm 15, K-PKE.Decrypt, L6]
*
**************************************************/
/*
* NOTE: The reference implementation uses a 3-argument mlk_poly_sub.
* We specialize to the accumulator form to avoid reasoning about aliasing.
*/
MLK_INTERNAL_API
void mlk_poly_sub(mlk_poly *r, const mlk_poly *b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(b, sizeof(mlk_poly)))
requires(forall(k0, 0, MLKEM_N, (int32_t) r->coeffs[k0] - b->coeffs[k0] <= INT16_MAX))
requires(forall(k1, 0, MLKEM_N, (int32_t) r->coeffs[k1] - b->coeffs[k1] >= INT16_MIN))
ensures(forall(k, 0, MLKEM_N, r->coeffs[k] == old(*r).coeffs[k] - b->coeffs[k]))
assigns(object_whole(r))
);
#define mlk_poly_ntt MLK_NAMESPACE(poly_ntt)
/*************************************************
* Name: mlk_poly_ntt
*
* Description: Computes negacyclic number-theoretic transform (NTT) of
* a polynomial in place.
*
* The input is assumed to be in normal order and
* coefficient-wise bound by MLKEM_Q in absolute value.
*
* The output polynomial is in bitreversed order, and
* coefficient-wise bound by MLK_NTT_BOUND in absolute value.
*
* (NOTE: Sometimes the input to the NTT is actually smaller,
* which gives better bounds.)
*
* Arguments: - mlk_poly *r: pointer to in/output polynomial
*
* Specification: Implements @[FIPS203, Algorithm 9, NTT]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_ntt(mlk_poly *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_Q))
assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLK_NTT_BOUND))
);
#define mlk_poly_invntt_tomont MLK_NAMESPACE(poly_invntt_tomont)
/*************************************************
* Name: mlk_poly_invntt_tomont
*
* Description: Computes inverse of negacyclic number-theoretic transform (NTT)
* of a polynomial in place;
* inputs assumed to be in bitreversed order, output in normal
* order
*
* The input is assumed to be in bitreversed order, and can
* have arbitrary coefficients in int16_t.
*
* The output polynomial is in normal order, and
* coefficient-wise bound by MLK_INVNTT_BOUND in absolute value.
*
* Arguments: - mlk_poly *r: pointer to in/output polynomial
*
* Specification: Implements composition of @[FIPS203, Algorithm 10, NTT^{-1}]
* and elementwise modular multiplication with a suitable
* Montgomery factor introduced during the base multiplication.
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_invntt_tomont(mlk_poly *r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
assigns(memory_slice(r, sizeof(mlk_poly)))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND))
);
#endif /* !MLK_POLY_H */

View File

@@ -0,0 +1,457 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*
* - [NeonNTT]
* Neon NTT: Faster Dilithium, Kyber, and Saber on Cortex-A72 and Apple M1
* Becker, Hwang, Kannwischer, Yang, Yang
* https://eprint.iacr.org/2021/986
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
#include <stdint.h>
#include <string.h>
#include "compress.h"
#include "debug.h"
#include "poly_k.h"
#include "sampling.h"
#include "symmetric.h"
/* Parameter set namespacing
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying parameter sets)
* within a single compilation unit. */
#define mlk_poly_cbd_eta1 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta1)
#define mlk_poly_cbd_eta2 MLK_ADD_PARAM_SET(mlk_poly_cbd_eta2)
/* End of parameter set namespacing */
/* Reference: `polyvec_compress()` in the reference implementation @[REF]
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
                             const mlk_polyvec a)
{
  unsigned idx = 0;

  /* Inputs are expected to be unsigned canonical, i.e. in [0, MLKEM_Q). */
  mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);

  /* Compress the MLKEM_K polynomials into consecutive output chunks of
   * MLKEM_POLYCOMPRESSEDBYTES_DU bytes each. */
  while (idx < MLKEM_K)
  {
    uint8_t *dst = &r[idx * MLKEM_POLYCOMPRESSEDBYTES_DU];
    mlk_poly_compress_du(dst, a + idx);
    idx++;
  }
}
/* Reference: `polyvec_decompress()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
void mlk_polyvec_decompress_du(mlk_polyvec r,
                               const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
{
  unsigned idx = 0;

  /* Decode the MLKEM_K packed polynomials, which are stored back to back
   * in chunks of MLKEM_POLYCOMPRESSEDBYTES_DU bytes. */
  while (idx < MLKEM_K)
  {
    const uint8_t *src = &a[idx * MLKEM_POLYCOMPRESSEDBYTES_DU];
    mlk_poly_decompress_du(r + idx, src);
    idx++;
  }

  /* Outputs are expected to be unsigned canonical, i.e. in [0, MLKEM_Q). */
  mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_tobytes()` in the reference implementation @[REF].
* - In contrast to the reference implementation, we assume
* unsigned canonical coefficients here.
* The reference implementation works with coefficients
* in the range (-MLKEM_Q+1,...,MLKEM_Q-1). */
/* Serializes the MLKEM_K polynomials of `a` into consecutive chunks of
* MLKEM_POLYBYTES bytes in `r`, one mlk_poly_tobytes() call per polynomial. */
MLK_INTERNAL_API
void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
{
unsigned i;
/* Inputs are expected to be unsigned canonical, i.e. in [0, MLKEM_Q). */
mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
for (i = 0; i < MLKEM_K; i++)
__loop__(
/* The loop writes only the counter and the output buffer. */
assigns(i, object_whole(r))
invariant(i <= MLKEM_K)
)
{
mlk_poly_tobytes(&r[i * MLKEM_POLYBYTES], &a[i]);
}
}
/* Reference: `polyvec_frombytes()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
{
  unsigned idx = 0;

  /* Deserialize the MLKEM_K polynomials, which are stored back to back
   * in chunks of MLKEM_POLYBYTES bytes. */
  while (idx < MLKEM_K)
  {
    const uint8_t *src = &a[idx * MLKEM_POLYBYTES];
    mlk_poly_frombytes(r + idx, src);
    idx++;
  }

  /* Deserialized coefficients are expected in [0, MLKEM_UINT12_LIMIT). */
  mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
}
/* Reference: `polyvec_ntt()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
void mlk_polyvec_ntt(mlk_polyvec r)
{
  unsigned idx = 0;

  /* Transform every polynomial of the vector in place. */
  while (idx < MLKEM_K)
  {
    mlk_poly_ntt(r + idx);
    idx++;
  }

  mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_NTT_BOUND);
}
/* Reference: `polyvec_invntt_tomont()` in the reference implementation @[REF].
* - We normalize at the beginning of the inverse NTT,
* while the reference implementation normalizes at
* the end. This allows us to drop a call to `poly_reduce()`
* from the base multiplication. */
MLK_INTERNAL_API
void mlk_polyvec_invntt_tomont(mlk_polyvec r)
{
  unsigned idx = 0;

  /* Inverse-transform every polynomial of the vector in place. */
  while (idx < MLKEM_K)
  {
    mlk_poly_invntt_tomont(r + idx);
    idx++;
  }

  mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLK_INVNTT_BOUND);
}
/* Reference: `polyvec_basemul_acc_montgomery()` in the
* reference implementation @[REF].
* - We use a multiplication cache ('mulcache') here
* which is not present in the reference implementation @[REF].
* This idea originates from @[NeonNTT] and is used
* at the C level here.
* - We compute the coefficients of the scalar product in 32-bit
* coefficients and perform only a single modular reduction
* at the end. The reference implementation uses 2 * MLKEM_K
* more modular reductions since it reduces after every modular
* multiplication. */
/* Computes the scalar product of the polynomial vectors `a` and `b` in NTT
 * domain and stores the Montgomery-reduced result in `r`.
 *
 * - r:       output polynomial
 * - a:       first input vector; coefficients are asserted to lie in
 *            [0, MLKEM_UINT12_LIMIT)
 * - b:       second input vector
 * - b_cache: mulcache belonging to `b`
 *
 * If a native backend is compiled in, it is tried first; the C code below
 * is used when it is absent or does not report success. */
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
    mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
    const mlk_polyvec_mulcache b_cache)
{
  unsigned i;
  mlk_assert_bound_2d(a, MLKEM_K, MLKEM_N, 0, MLKEM_UINT12_LIMIT);
#if defined(MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED)
  {
    int ret;
    /* Omitting bounds assertion for cache since native implementations may
     * decide not to use a mulcache. Note that the C backend implementation
     * of poly_basemul_montgomery_cached() does still include the check. */
#if MLKEM_K == 2
    ret = mlk_polyvec_basemul_acc_montgomery_cached_k2_native(
        r->coeffs, (const int16_t *)a, (const int16_t *)b,
        (const int16_t *)b_cache);
#elif MLKEM_K == 3
    ret = mlk_polyvec_basemul_acc_montgomery_cached_k3_native(
        r->coeffs, (const int16_t *)a, (const int16_t *)b,
        (const int16_t *)b_cache);
#elif MLKEM_K == 4
    ret = mlk_polyvec_basemul_acc_montgomery_cached_k4_native(
        r->coeffs, (const int16_t *)a, (const int16_t *)b,
        (const int16_t *)b_cache);
#else
/* Without this, `ret` would be read uninitialized below. */
#error "Invalid value of MLKEM_K"
#endif
    if (ret == MLK_NATIVE_FUNC_SUCCESS)
    {
      return;
    }
  }
#endif /* MLK_USE_NATIVE_POLYVEC_BASEMUL_ACC_MONTGOMERY_CACHED */
  /* One iteration per coefficient pair (2 * i, 2 * i + 1). */
  for (i = 0; i < MLKEM_N / 2; i++)
  __loop__(invariant(i <= MLKEM_N / 2))
  {
    unsigned k;
    /* 32-bit accumulators for the two output coefficients of this pair;
     * they are reduced only once, after summing over all MLKEM_K entries. */
    int32_t t[2] = {0};
    for (k = 0; k < MLKEM_K; k++)
    __loop__(
      /* Each iteration adds two products to each accumulator, hence the
       * bound grows linearly in k. */
      invariant(k <= MLKEM_K &&
         t[0] <=    (int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768  &&
         t[0] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
         t[1] <=   ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768) &&
         t[1] >= - ((int32_t) k * 2 * MLKEM_UINT12_LIMIT * 32768)))
    {
      /* Even output: a1 * (cached value for b) + a0 * b0 */
      t[0] += (int32_t)a[k].coeffs[2 * i + 1] * b_cache[k].coeffs[i];
      t[0] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i];
      /* Odd output: a0 * b1 + a1 * b0 */
      t[1] += (int32_t)a[k].coeffs[2 * i] * b[k].coeffs[2 * i + 1];
      t[1] += (int32_t)a[k].coeffs[2 * i + 1] * b[k].coeffs[2 * i];
    }
    r->coeffs[2 * i + 0] = mlk_montgomery_reduce(t[0]);
    r->coeffs[2 * i + 1] = mlk_montgomery_reduce(t[1]);
  }
}
/* Reference: Does not exist in the reference implementation @[REF].
* - The reference implementation does not use a
* multiplication cache ('mulcache'). This idea originates
* from @[NeonNTT] and is used at the C level here. */
MLK_INTERNAL_API
void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
{
  unsigned idx = 0;

  /* Populate one mulcache entry per polynomial of the vector. */
  while (idx < MLKEM_K)
  {
    mlk_poly_mulcache_compute(x + idx, a + idx);
    idx++;
  }
}
/* Reference: `polyvec_reduce()` in the reference implementation @[REF].
* - We use _unsigned_ canonical outputs, while the reference
* implementation uses _signed_ canonical outputs.
* Accordingly, we need a conditional addition of MLKEM_Q
* here to go from signed to unsigned representatives.
* This conditional addition is then dropped from all
* polynomial compression functions instead (see `compress.c`). */
MLK_INTERNAL_API
void mlk_polyvec_reduce(mlk_polyvec r)
{
  unsigned idx = 0;

  /* Reduce every polynomial of the vector in place. */
  while (idx < MLKEM_K)
  {
    mlk_poly_reduce(r + idx);
    idx++;
  }

  /* Outputs are expected to be unsigned canonical, i.e. in [0, MLKEM_Q). */
  mlk_assert_bound_2d(r, MLKEM_K, MLKEM_N, 0, MLKEM_Q);
}
/* Reference: `polyvec_add()` in the reference implementation @[REF].
* - We use destructive version (output=first input) to avoid
* reasoning about aliasing in the CBMC specification */
MLK_INTERNAL_API
void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
{
  unsigned idx = 0;

  /* Accumulate b into r, one polynomial at a time. */
  while (idx < MLKEM_K)
  {
    mlk_poly_add(r + idx, b + idx);
    idx++;
  }
}
/* Reference: `polyvec_tomont()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
void mlk_polyvec_tomont(mlk_polyvec r)
{
  unsigned idx = 0;

  /* Convert every polynomial of the vector in place. */
  while (idx < MLKEM_K)
  {
    mlk_poly_tomont(r + idx);
    idx++;
  }

  mlk_assert_abs_bound_2d(r, MLKEM_K, MLKEM_N, MLKEM_Q);
}
/*************************************************
* Name: mlk_poly_cbd_eta1
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter MLKEM_ETA1.
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
*
* Specification: Implements @[FIPS203, Algorithm 8, SamplePolyCBD_eta1], where
* eta1 is specified per parameter set in @[FIPS203, Table 2]
* and represented as MLKEM_ETA1 here.
*
**************************************************/
/* Reference: `poly_cbd_eta1` in the reference implementation @[REF]. */
static MLK_INLINE void mlk_poly_cbd_eta1(
mlk_poly *r, const uint8_t buf[MLKEM_ETA1 * MLKEM_N / 4])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(buf, MLKEM_ETA1 * MLKEM_N / 4))
assigns(memory_slice(r, sizeof(mlk_poly)))
/* Output coefficients are below MLKEM_ETA1 + 1 in absolute value. */
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA1 + 1))
)
{
/* Compile-time dispatch on MLKEM_ETA1: only the values 2 and 3 are
* supported, anything else is rejected at build time. */
#if MLKEM_ETA1 == 2
mlk_poly_cbd2(r, buf);
#elif MLKEM_ETA1 == 3
mlk_poly_cbd3(r, buf);
#else
#error "Invalid value of MLKEM_ETA1"
#endif
}
/* Reference: Does not exist in the reference implementation @[REF].
* - This implements a x4-batched version of `poly_getnoise_eta1()`
* from the reference implementation, to leverage
* batched Keccak-f1600.*/
MLK_INTERNAL_API
void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
                               mlk_poly *r3, const uint8_t seed[MLKEM_SYMBYTES],
                               uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
                               uint8_t nonce3)
{
  /* One PRF output buffer and one PRF input (seed || nonce) per lane. */
  MLK_ALIGN uint8_t prf_out[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
  MLK_ALIGN uint8_t prf_in[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];

  /* Lane 0 */
  mlk_memcpy(prf_in[0], seed, MLKEM_SYMBYTES);
  prf_in[0][MLKEM_SYMBYTES] = nonce0;
  /* Lane 1 */
  mlk_memcpy(prf_in[1], seed, MLKEM_SYMBYTES);
  prf_in[1][MLKEM_SYMBYTES] = nonce1;
  /* Lane 2 */
  mlk_memcpy(prf_in[2], seed, MLKEM_SYMBYTES);
  prf_in[2][MLKEM_SYMBYTES] = nonce2;
  /* Lane 3 */
  mlk_memcpy(prf_in[3], seed, MLKEM_SYMBYTES);
  prf_in[3][MLKEM_SYMBYTES] = nonce3;

  /* Expand all four inputs with one batched PRF call. */
  mlk_prf_eta1_x4(prf_out, prf_in);

  /* Turn each lane's PRF output into a noise polynomial. */
  mlk_poly_cbd_eta1(r0, prf_out[0]);
  mlk_poly_cbd_eta1(r1, prf_out[1]);
  mlk_poly_cbd_eta1(r2, prf_out[2]);
  mlk_poly_cbd_eta1(r3, prf_out[3]);

  mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
  mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
  mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA1 + 1);
  mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA1 + 1);

  /* Specification: Partially implements
   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
  mlk_zeroize(prf_out, sizeof(prf_out));
  mlk_zeroize(prf_in, sizeof(prf_in));
}
#if MLKEM_K == 2 || MLKEM_K == 4
/*************************************************
* Name: mlk_poly_cbd_eta2
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter MLKEM_ETA2.
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
*
* Specification: Implements @[FIPS203, Algorithm 8, SamplePolyCBD_eta2], where
* eta2 is specified per parameter set in @[FIPS203, Table 2]
* and represented as MLKEM_ETA2 here.
*
**************************************************/
/* Reference: `poly_cbd_eta2` in the reference implementation @[REF]. */
static MLK_INLINE void mlk_poly_cbd_eta2(
mlk_poly *r, const uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(buf, MLKEM_ETA2 * MLKEM_N / 4))
assigns(memory_slice(r, sizeof(mlk_poly)))
/* Output coefficients are below MLKEM_ETA2 + 1 in absolute value. */
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1)))
{
/* Compile-time dispatch on MLKEM_ETA2: only the value 2 is supported,
* anything else is rejected at build time. */
#if MLKEM_ETA2 == 2
mlk_poly_cbd2(r, buf);
#else
#error "Invalid value of MLKEM_ETA2"
#endif
}
/* Reference: `poly_getnoise_eta2()` in the reference implementation @[REF].
 * - We include buffer zeroization.
 *
 * Samples a noise polynomial with parameter MLKEM_ETA2 from a seed and nonce.
 *
 * - r:     output polynomial
 * - seed:  input seed of MLKEM_SYMBYTES bytes
 * - nonce: one-byte nonce appended to the seed to form the PRF input */
MLK_INTERNAL_API
void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
                            uint8_t nonce)
{
  MLK_ALIGN uint8_t buf[MLKEM_ETA2 * MLKEM_N / 4];
  MLK_ALIGN uint8_t extkey[MLKEM_SYMBYTES + 1];

  /* PRF input is seed || nonce. */
  mlk_memcpy(extkey, seed, MLKEM_SYMBYTES);
  extkey[MLKEM_SYMBYTES] = nonce;
  mlk_prf_eta2(buf, extkey);

  mlk_poly_cbd_eta2(r, buf);

  /* The bound must match the eta2 sampler used above: mlk_poly_cbd_eta2()
   * ensures MLKEM_ETA2 + 1, which is tighter than MLKEM_ETA1 + 1 whenever
   * the two parameters differ. */
  mlk_assert_abs_bound(r, MLKEM_N, MLKEM_ETA2 + 1);

  /* Specification: Partially implements
   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
  mlk_zeroize(buf, sizeof(buf));
  mlk_zeroize(extkey, sizeof(extkey));
}
#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
#if MLKEM_K == 2
/* Reference: Does not exist in the reference implementation @[REF].
* - This implements a x4-batched version of `poly_getnoise_eta1()`
* and `poly_getnoise_eta2()` from the reference implementation,
* leveraging batched Keccak-f1600.
* - If a x4-batched Keccak-f1600 is available, we squeeze
* more random data than needed for the eta2 calls, to be
* able to use a x4-batched Keccak-f1600. */
MLK_INTERNAL_API
void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
mlk_poly *r3,
const uint8_t seed[MLKEM_SYMBYTES],
uint8_t nonce0, uint8_t nonce1,
uint8_t nonce2, uint8_t nonce3)
{
#if MLKEM_ETA2 >= MLKEM_ETA1
#error mlk_poly_getnoise_eta1122_4x assumes MLKEM_ETA1 > MLKEM_ETA2
#endif
/* All four rows of buf are sized for eta1; by the check above this is
* also large enough for the eta2 lanes (rows 2 and 3). */
MLK_ALIGN uint8_t buf[4][MLK_ALIGN_UP(MLKEM_ETA1 * MLKEM_N / 4)];
MLK_ALIGN uint8_t extkey[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 1)];
/* Every lane's PRF input is seed || nonce_i. */
mlk_memcpy(extkey[0], seed, MLKEM_SYMBYTES);
mlk_memcpy(extkey[1], seed, MLKEM_SYMBYTES);
mlk_memcpy(extkey[2], seed, MLKEM_SYMBYTES);
mlk_memcpy(extkey[3], seed, MLKEM_SYMBYTES);
extkey[0][MLKEM_SYMBYTES] = nonce0;
extkey[1][MLKEM_SYMBYTES] = nonce1;
extkey[2][MLKEM_SYMBYTES] = nonce2;
extkey[3][MLKEM_SYMBYTES] = nonce3;
/* On systems with fast batched Keccak, we use 4-fold batched PRF,
* even though that means generating more random data in buf[2] and buf[3]
* than necessary. */
#if !defined(FIPS202_X4_DEFAULT_IMPLEMENTATION)
mlk_prf_eta1_x4(buf, extkey);
#else
/* Otherwise, four separate PRF calls, each producing exactly the amount
* of data its lane needs. */
mlk_prf_eta1(buf[0], extkey[0]);
mlk_prf_eta1(buf[1], extkey[1]);
mlk_prf_eta2(buf[2], extkey[2]);
mlk_prf_eta2(buf[3], extkey[3]);
#endif /* FIPS202_X4_DEFAULT_IMPLEMENTATION */
/* r0 and r1 are sampled with eta1, r2 and r3 with eta2. */
mlk_poly_cbd_eta1(r0, buf[0]);
mlk_poly_cbd_eta1(r1, buf[1]);
mlk_poly_cbd_eta2(r2, buf[2]);
mlk_poly_cbd_eta2(r3, buf[3]);
mlk_assert_abs_bound(r0, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r1, MLKEM_N, MLKEM_ETA1 + 1);
mlk_assert_abs_bound(r2, MLKEM_N, MLKEM_ETA2 + 1);
mlk_assert_abs_bound(r3, MLKEM_N, MLKEM_ETA2 + 1);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
mlk_zeroize(extkey, sizeof(extkey));
}
#endif /* MLKEM_K == 2 */
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef mlk_poly_cbd_eta1
#undef mlk_poly_cbd_eta2

View File

@@ -0,0 +1,655 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*/
#ifndef MLK_POLY_K_H
#define MLK_POLY_K_H
#include <stdint.h>
#include "common.h"
#include "compress.h"
#include "poly.h"
/* Parameter set namespacing
* This is to facilitate building multiple instances
* of mlkem-native (e.g. with varying parameter sets)
* within a single compilation unit. */
#define mlk_polyvec MLK_ADD_PARAM_SET(mlk_polyvec)
#define mlk_polymat MLK_ADD_PARAM_SET(mlk_polymat)
#define mlk_polyvec_mulcache MLK_ADD_PARAM_SET(mlk_polyvec_mulcache)
/* End of parameter set namespacing */
typedef mlk_poly mlk_polyvec[MLKEM_K];
typedef mlk_poly mlk_polymat[MLKEM_K * MLKEM_K];
typedef mlk_poly_mulcache mlk_polyvec_mulcache[MLKEM_K];
#define mlk_poly_compress_du MLK_NAMESPACE_K(poly_compress_du)
/*************************************************
* Name: mlk_poly_compress_du
*
* Description: Compression (du bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
* - const mlk_poly *a: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Specification: Implements `ByteEncode_{d_u} (Compress_{d_u} (u))`
* in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22],
* with level-specific d_u defined in @[FIPS203, Table 2],
* and given by MLKEM_DU here.
*
**************************************************/
static MLK_INLINE void mlk_poly_compress_du(
    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DU], const mlk_poly *a)
__contract__(
  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DU))
  requires(memory_no_alias(a, sizeof(mlk_poly)))
  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
  assigns(memory_slice(r, MLKEM_POLYCOMPRESSEDBYTES_DU)))
{
  /* Compile-time dispatch on the d_u bit width of the active parameter set;
   * only the widths 10 and 11 are supported. */
#if MLKEM_DU == 11
  mlk_poly_compress_d11(r, a);
#elif MLKEM_DU == 10
  mlk_poly_compress_d10(r, a);
#else
#error "Invalid value of MLKEM_DU"
#endif
}
#define mlk_poly_decompress_du MLK_NAMESPACE_K(poly_decompress_du)
/*************************************************
* Name: mlk_poly_decompress_du
*
* Description: De-serialization and subsequent decompression (du bits) of a
* polynomial; approximate inverse of mlk_poly_compress_du
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_DU bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Specification: Implements `Decompress_{d_u} (ByteDecode_{d_u} (u))`
* in @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L3].
* with level-specific d_u defined in @[FIPS203, Table 2],
* and given by MLKEM_DU here.
*
**************************************************/
static MLK_INLINE void mlk_poly_decompress_du(
    mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DU])
__contract__(
  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DU))
  requires(memory_no_alias(r, sizeof(mlk_poly)))
  assigns(memory_slice(r, sizeof(mlk_poly)))
  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
{
  /* Compile-time dispatch on the d_u bit width of the active parameter set;
   * only the widths 10 and 11 are supported. */
#if MLKEM_DU == 11
  mlk_poly_decompress_d11(r, a);
#elif MLKEM_DU == 10
  mlk_poly_decompress_d10(r, a);
#else
#error "Invalid value of MLKEM_DU"
#endif
}
#define mlk_poly_compress_dv MLK_NAMESPACE_K(poly_compress_dv)
/*************************************************
* Name: mlk_poly_compress_dv
*
* Description: Compression (dv bits) and subsequent serialization of a
* polynomial
*
* Arguments: - uint8_t *r: pointer to output byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
* - const mlk_poly *a: pointer to input polynomial
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Specification: Implements `ByteEncode_{d_v} (Compress_{d_v} (v))`
* in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L23].
* with level-specific d_v defined in @[FIPS203, Table 2],
* and given by MLKEM_DV here.
*
**************************************************/
static MLK_INLINE void mlk_poly_compress_dv(
    uint8_t r[MLKEM_POLYCOMPRESSEDBYTES_DV], const mlk_poly *a)
__contract__(
  requires(memory_no_alias(r, MLKEM_POLYCOMPRESSEDBYTES_DV))
  requires(memory_no_alias(a, sizeof(mlk_poly)))
  requires(array_bound(a->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
  assigns(object_whole(r)))
{
  /* Compile-time dispatch on the d_v bit width of the active parameter set;
   * only the widths 4 and 5 are supported. */
#if MLKEM_DV == 5
  mlk_poly_compress_d5(r, a);
#elif MLKEM_DV == 4
  mlk_poly_compress_d4(r, a);
#else
#error "Invalid value of MLKEM_DV"
#endif
}
#define mlk_poly_decompress_dv MLK_NAMESPACE_K(poly_decompress_dv)
/*************************************************
* Name: mlk_poly_decompress_dv
*
* Description: De-serialization and subsequent decompression (dv bits) of a
* polynomial; approximate inverse of mlk_poly_compress_dv
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYCOMPRESSEDBYTES_DV bytes)
*
* Upon return, the coefficients of the output polynomial are unsigned-canonical
* (non-negative and smaller than MLKEM_Q).
*
* Specification: Implements `Decompress_{d_v} (ByteDecode_{d_v} (v))`
* in @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L4].
* with level-specific d_v defined in @[FIPS203, Table 2],
* and given by MLKEM_DV here.
*
**************************************************/
static MLK_INLINE void mlk_poly_decompress_dv(
    mlk_poly *r, const uint8_t a[MLKEM_POLYCOMPRESSEDBYTES_DV])
__contract__(
  requires(memory_no_alias(a, MLKEM_POLYCOMPRESSEDBYTES_DV))
  requires(memory_no_alias(r, sizeof(mlk_poly)))
  assigns(object_whole(r))
  ensures(array_bound(r->coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
{
  /* Compile-time dispatch on the d_v bit width of the active parameter set;
   * only the widths 4 and 5 are supported. */
#if MLKEM_DV == 5
  mlk_poly_decompress_d5(r, a);
#elif MLKEM_DV == 4
  mlk_poly_decompress_d4(r, a);
#else
#error "Invalid value of MLKEM_DV"
#endif
}
#define mlk_polyvec_compress_du MLK_NAMESPACE_K(polyvec_compress_du)
/*************************************************
* Name: mlk_polyvec_compress_du
*
* Description: Compress and serialize vector of polynomials
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for MLKEM_POLYVECCOMPRESSEDBYTES_DU)
* - const mlk_polyvec a: pointer to input vector of polynomials.
* Coefficients must be unsigned canonical,
* i.e. in [0,1,..,MLKEM_Q-1].
*
* Specification: Implements `ByteEncode_{d_u} (Compress_{d_u} (u))`
* in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L22].
* with level-specific d_u defined in @[FIPS203, Table 2],
* and given by MLKEM_DU here.
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_compress_du(uint8_t r[MLKEM_POLYVECCOMPRESSEDBYTES_DU],
const mlk_polyvec a)
__contract__(
requires(memory_no_alias(r, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(forall(k0, 0, MLKEM_K,
array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
assigns(object_whole(r))
);
#define mlk_polyvec_decompress_du MLK_NAMESPACE_K(polyvec_decompress_du)
/*************************************************
* Name: mlk_polyvec_decompress_du
*
* Description: De-serialize and decompress vector of polynomials;
* approximate inverse of mlk_polyvec_compress_du
*
* Arguments: - mlk_polyvec r: pointer to output vector of polynomials.
* Output will have coefficients normalized to [0,..,q-1].
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYVECCOMPRESSEDBYTES_DU)
*
* Specification: Implements `Decompress_{d_u} (ByteDecode_{d_u} (u))`
* in @[FIPS203, Algorithm 15 (K-PKE.Decrypt), L3].
* with level-specific d_u defined in @[FIPS203, Table 2],
* and given by MLKEM_DU here.
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_decompress_du(mlk_polyvec r,
const uint8_t a[MLKEM_POLYVECCOMPRESSEDBYTES_DU])
__contract__(
requires(memory_no_alias(a, MLKEM_POLYVECCOMPRESSEDBYTES_DU))
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(object_whole(r))
ensures(forall(k0, 0, MLKEM_K,
array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_tobytes MLK_NAMESPACE_K(polyvec_tobytes)
/*************************************************
* Name: mlk_polyvec_tobytes
*
* Description: Serialize vector of polynomials
*
* Arguments: - uint8_t *r: pointer to output byte array
* (needs space for MLKEM_POLYVECBYTES)
* - const mlk_polyvec a: pointer to input vector of polynomials
* Each polynomial must have coefficients in [0,..,q-1].
*
* Specification: Implements ByteEncode_12 @[FIPS203, Algorithm 5].
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* and @[FIPS203, 2.4.6, Matrices and Vectors]
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_tobytes(uint8_t r[MLKEM_POLYVECBYTES], const mlk_polyvec a)
__contract__(
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(r, MLKEM_POLYVECBYTES))
requires(forall(k0, 0, MLKEM_K,
array_bound(a[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
assigns(object_whole(r))
);
#define mlk_polyvec_frombytes MLK_NAMESPACE_K(polyvec_frombytes)
/*************************************************
* Name: mlk_polyvec_frombytes
*
* Description: De-serialize vector of polynomials;
* inverse of mlk_polyvec_tobytes
*
* Arguments: - mlk_polyvec r: pointer to output vector of polynomials.
* Output will have coefficients normalized in [0..4095].
* - const uint8_t *a: pointer to input byte array
* (of length MLKEM_POLYVECBYTES)
*
* Specification: Implements ByteDecode_12 @[FIPS203, Algorithm 6].
* Extended to vectors as per
* @[FIPS203, 2.4.8 Applying Algorithms to Arrays]
* and @[FIPS203, 2.4.6, Matrices and Vectors]
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_frombytes(mlk_polyvec r, const uint8_t a[MLKEM_POLYVECBYTES])
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(a, MLKEM_POLYVECBYTES))
assigns(object_whole(r))
ensures(forall(k0, 0, MLKEM_K,
array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
);
#define mlk_polyvec_ntt MLK_NAMESPACE_K(polyvec_ntt)
/*************************************************
* Name: mlk_polyvec_ntt
*
* Description: Apply forward NTT to all elements of a vector of polynomials.
*
* The input is assumed to be in normal order and
* coefficient-wise bound by MLKEM_Q in absolute value.
*
* The output polynomial is in bitreversed order, and
* coefficient-wise bound by MLK_NTT_BOUND in absolute value.
*
* Arguments: - mlk_polyvec r: pointer to in/output vector of polynomials
*
* Specification:
* - Implements @[FIPS203, Algorithm 9, NTT]
* - Extended to vectors as per @[FIPS203, 2.4.6, Matrices and Vectors]
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_ntt(mlk_polyvec r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(forall(j, 0, MLKEM_K,
array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_NTT_BOUND)))
);
#define mlk_polyvec_invntt_tomont MLK_NAMESPACE_K(polyvec_invntt_tomont)
/*************************************************
* Name: mlk_polyvec_invntt_tomont
*
* Description: Apply inverse NTT to all elements of a vector of polynomials
* and multiply by Montgomery factor 2^16
*
* The input is assumed to be in bitreversed order, and can
* have arbitrary coefficients in int16_t.
*
* The output polynomial is in normal order, and
* coefficient-wise bound by MLK_INVNTT_BOUND in absolute value.
*
* Arguments: - mlk_polyvec r: pointer to in/output vector of polynomials
*
* Specification:
* - Implements @[FIPS203, Algorithm 10, NTT^{-1}]
* - Extended to vectors as per @[FIPS203, 2.4.6, Matrices and Vectors]
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_invntt_tomont(mlk_polyvec r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLK_INVNTT_BOUND)))
);
#define mlk_polyvec_basemul_acc_montgomery_cached \
MLK_NAMESPACE_K(polyvec_basemul_acc_montgomery_cached)
/*************************************************
* Name: mlk_polyvec_basemul_acc_montgomery_cached
*
* Description: Scalar product of two vectors of polynomials in NTT domain,
* using mulcache for second operand.
*
* Bounds:
* - Every coefficient of a is assumed to be in [0..4095]
* - No bounds guarantees for the coefficients in the result.
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const mlk_polyvec a: pointer to first input polynomial vector
* - const mlk_polyvec b: pointer to second input polynomial
* vector
* - const mlk_polyvec_mulcache b_cache: pointer to mulcache
* for second input polynomial vector. Can be computed
* via mlk_polyvec_mulcache_compute().
*
* Specification: Implements
* - @[FIPS203, Section 2.4.7, Eq (2.14)]
* - @[FIPS203, Algorithm 11, MultiplyNTTs]
* - @[FIPS203, Algorithm 12, BaseCaseMultiply]
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_basemul_acc_montgomery_cached(
mlk_poly *r, const mlk_polyvec a, const mlk_polyvec b,
const mlk_polyvec_mulcache b_cache)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(memory_no_alias(b_cache, sizeof(mlk_polyvec_mulcache)))
requires(forall(k1, 0, MLKEM_K,
array_bound(a[k1].coeffs, 0, MLKEM_N, 0, MLKEM_UINT12_LIMIT)))
assigns(object_whole(r))
);
#define mlk_polyvec_mulcache_compute MLK_NAMESPACE_K(polyvec_mulcache_compute)
/************************************************************
* Name: mlk_polyvec_mulcache_compute
*
* Description: Computes the mulcache for a vector of polynomials in NTT domain
*
* The mulcache of a degree-2 polynomial b := b0 + b1*X
* in Fq[X]/(X^2-zeta) is the value b1*zeta, needed when
* computing products of b in Fq[X]/(X^2-zeta).
*
* The mulcache of a polynomial in NTT domain -- which is
* a 128-tuple of degree-2 polynomials in Fq[X]/(X^2-zeta),
* for varying zeta, is the 128-tuple of mulcaches of those
* polynomials.
*
* The mulcache of a vector of polynomials is the vector
* of mulcaches of its entries.
*
* Arguments: - x: Pointer to mulcache to be populated
* - a: Pointer to input polynomial vector
*
* Specification:
* - Caches `b_1 * \gamma` in @[FIPS203, Algorithm 12, BaseCaseMultiply, L1]
*
************************************************************/
/*
* NOTE: The default C implementation of this function populates
* the mulcache with values in (-q,q), but this is not needed for the
* higher level safety proofs, and thus not part of the spec.
*/
MLK_INTERNAL_API
void mlk_polyvec_mulcache_compute(mlk_polyvec_mulcache x, const mlk_polyvec a)
__contract__(
requires(memory_no_alias(x, sizeof(mlk_polyvec_mulcache)))
requires(memory_no_alias(a, sizeof(mlk_polyvec)))
assigns(object_whole(x))
);
#define mlk_polyvec_reduce MLK_NAMESPACE_K(polyvec_reduce)
/*************************************************
* Name: mlk_polyvec_reduce
*
* Description: Applies Barrett reduction to each coefficient
* of each element of a vector of polynomials;
* for details of the Barrett reduction see comments in reduce.c
*
* Arguments: - mlk_polyvec r: pointer to input/output polynomial
*
* Specification: Normalizes on unsigned canonical representatives
* ahead of calling @[FIPS203, Compress_d, Eq (4.7)].
* This is not made explicit in FIPS 203.
*
**************************************************/
/*
* NOTE: The semantics of mlk_polyvec_reduce() is different in
* the reference implementation, which requires
* signed canonical output data. Unsigned canonical
* outputs are better suited to the only remaining
* use of mlk_poly_reduce() in the context of (de)serialization.
*/
MLK_INTERNAL_API
void mlk_polyvec_reduce(mlk_polyvec r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(object_whole(r))
ensures(forall(k0, 0, MLKEM_K,
array_bound(r[k0].coeffs, 0, MLKEM_N, 0, MLKEM_Q)))
);
#define mlk_polyvec_add MLK_NAMESPACE_K(polyvec_add)
/*************************************************
* Name: mlk_polyvec_add
*
* Description: Add vectors of polynomials
*
* Arguments: - mlk_polyvec r: pointer to input-output vector of polynomials to
* be added to
* - const mlk_polyvec b: pointer to second input vector of
* polynomials
*
* The coefficients of r and b must be so that the addition does
* not overflow. Otherwise, the behaviour of this function is undefined.
*
* The coefficients returned in *r are in int16_t which is sufficient
* to prove type-safety of calling units. Therefore, no stronger
* ensures clause is required on this function.
*
* Specification:
* - @[FIPS203, 2.4.5, Arithmetic With Polynomials and NTT Representations]
* - Used in @[FIPS203, Algorithm 14 (K-PKE.Encrypt), L19]
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_add(mlk_polyvec r, const mlk_polyvec b)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
requires(memory_no_alias(b, sizeof(mlk_polyvec)))
requires(forall(j0, 0, MLKEM_K,
forall(k0, 0, MLKEM_N,
(int32_t)r[j0].coeffs[k0] + b[j0].coeffs[k0] <= INT16_MAX)))
requires(forall(j1, 0, MLKEM_K,
forall(k1, 0, MLKEM_N,
(int32_t)r[j1].coeffs[k1] + b[j1].coeffs[k1] >= INT16_MIN)))
assigns(object_whole(r))
);
#define mlk_polyvec_tomont MLK_NAMESPACE_K(polyvec_tomont)
/*************************************************
* Name: mlk_polyvec_tomont
*
* Description: Inplace conversion of all coefficients of a polynomial
* vector from normal domain to Montgomery domain
*
* Bounds: Output < q in absolute value.
*
*
* Specification: Internal normalization required in `mlk_indcpa_keypair_derand`
* as part of matrix-vector multiplication
* @[FIPS203, Algorithm 13, K-PKE.KeyGen, L18].
*
**************************************************/
MLK_INTERNAL_API
void mlk_polyvec_tomont(mlk_polyvec r)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_polyvec)))
assigns(memory_slice(r, sizeof(mlk_polyvec)))
assigns(object_whole(r))
ensures(forall(j, 0, MLKEM_K,
array_abs_bound(r[j].coeffs, 0, MLKEM_N, MLKEM_Q)))
);
#define mlk_poly_getnoise_eta1_4x MLK_NAMESPACE_K(poly_getnoise_eta1_4x)
/*************************************************
* Name: mlk_poly_getnoise_eta1_4x
*
* Description: Batch sample four polynomials deterministically from a seed
* and nonces, with output polynomials close to centered binomial
* distribution with parameter MLKEM_ETA1.
*
* Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce{0,1,2,3}: one-byte input nonce
*
* Specification:
* Implements 4x `SamplePolyCBD_{eta1} (PRF_{eta1} (sigma, N))`:
* - @[FIPS203, Algorithm 8, SamplePolyCBD_eta]
* - @[FIPS203, Eq (4.3), PRF_eta]
* - `SamplePolyCBD_{eta1} (PRF_{eta1} (sigma, N))` appears in
* @[FIPS203, Algorithm 13, K-PKE.KeyGen, L{9, 13}]
* @[FIPS203, Algorithm 14, K-PKE.Encrypt, L10]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_getnoise_eta1_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
mlk_poly *r3, const uint8_t seed[MLKEM_SYMBYTES],
uint8_t nonce0, uint8_t nonce1, uint8_t nonce2,
uint8_t nonce3)
__contract__(
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
requires(memory_no_alias(r0, sizeof(mlk_poly)))
requires(memory_no_alias(r1, sizeof(mlk_poly)))
requires(memory_no_alias(r2, sizeof(mlk_poly)))
requires(memory_no_alias(r3, sizeof(mlk_poly)))
assigns(memory_slice(r0, sizeof(mlk_poly)))
assigns(memory_slice(r1, sizeof(mlk_poly)))
assigns(memory_slice(r2, sizeof(mlk_poly)))
assigns(memory_slice(r3, sizeof(mlk_poly)))
ensures(
array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
&& array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
&& array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
&& array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1));
);
#if MLKEM_ETA1 == MLKEM_ETA2
/*
* We only require mlk_poly_getnoise_eta2_4x for ml-kem-768 and ml-kem-1024
* where MLKEM_ETA2 = MLKEM_ETA1 = 2.
* For ml-kem-512, mlk_poly_getnoise_eta1122_4x is used instead.
*/
#define mlk_poly_getnoise_eta2_4x mlk_poly_getnoise_eta1_4x
#endif /* MLKEM_ETA1 == MLKEM_ETA2 */
#if MLKEM_K == 2 || MLKEM_K == 4
#define mlk_poly_getnoise_eta2 MLK_NAMESPACE_K(poly_getnoise_eta2)
/*************************************************
* Name: mlk_poly_getnoise_eta2
*
* Description: Sample a polynomial deterministically from a seed and a nonce,
* with output polynomial close to centered binomial distribution
* with parameter MLKEM_ETA2
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce: one-byte input nonce
*
* Specification:
* Implements `SamplePolyCBD_{eta2} (PRF_{eta2} (sigma, N))`:
* - @[FIPS203, Algorithm 8, SamplePolyCBD_eta]
* - @[FIPS203, Eq (4.3), PRF_eta]
* - `SamplePolyCBD_{eta2} (PRF_{eta2} (sigma, N))` appears in
* @[FIPS203, Algorithm 14, K-PKE.Encrypt, L14]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_getnoise_eta2(mlk_poly *r, const uint8_t seed[MLKEM_SYMBYTES],
uint8_t nonce)
__contract__(
requires(memory_no_alias(r, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
assigns(object_whole(r))
ensures(array_abs_bound(r->coeffs, 0, MLKEM_N, MLKEM_ETA2 + 1))
);
#endif /* MLKEM_K == 2 || MLKEM_K == 4 */
#if MLKEM_K == 2
#define mlk_poly_getnoise_eta1122_4x MLK_NAMESPACE_K(poly_getnoise_eta1122_4x)
/*************************************************
* Name: mlk_poly_getnoise_eta1122_4x
*
* Description: Batch sample four polynomials deterministically from a seed
* and nonces, with output polynomials close to centered binomial
* distribution with parameter MLKEM_ETA1 and MLKEM_ETA2
*
* Arguments: - mlk_poly *r{0,1,2,3}: pointer to output polynomial
* - const uint8_t *seed: pointer to input seed
* (of length MLKEM_SYMBYTES bytes)
* - uint8_t nonce{0,1,2,3}: one-byte input nonce
*
* Specification:
* Implements two instances each of
* `SamplePolyCBD_{eta1} (PRF_{eta1} (sigma, N))` and
* `SamplePolyCBD_{eta2} (PRF_{eta2} (sigma, N))`:
* - @[FIPS203, Algorithm 8, SamplePolyCBD_eta]
* - @[FIPS203, Eq (4.3), PRF_eta]
* - `SamplePolyCBD_{eta2} (PRF_{eta2} (sigma, N))` appears in
* @[FIPS203, Algorithm 14, K-PKE.Encrypt, L14]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_getnoise_eta1122_4x(mlk_poly *r0, mlk_poly *r1, mlk_poly *r2,
mlk_poly *r3,
const uint8_t seed[MLKEM_SYMBYTES],
uint8_t nonce0, uint8_t nonce1,
uint8_t nonce2, uint8_t nonce3)
__contract__(
requires( /* r0, r1 consecutive, r2, r3 consecutive */
(memory_no_alias(r0, 2 * sizeof(mlk_poly)) && memory_no_alias(r2, 2 * sizeof(mlk_poly)) &&
r1 == r0 + 1 && r3 == r2 + 1 && !same_object(r0, r2)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES))
assigns(object_whole(r0), object_whole(r1), object_whole(r2), object_whole(r3))
ensures(array_abs_bound(r0->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
&& array_abs_bound(r1->coeffs,0, MLKEM_N, MLKEM_ETA1 + 1)
&& array_abs_bound(r2->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1)
&& array_abs_bound(r3->coeffs,0, MLKEM_N, MLKEM_ETA2 + 1));
);
#endif /* MLKEM_K == 2 */
#endif /* !MLK_POLY_K_H */

View File

@@ -0,0 +1,22 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_RANDOMBYTES_H
#define MLK_RANDOMBYTES_H
#include <stddef.h>
#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#if !defined(MLK_CONFIG_CUSTOM_RANDOMBYTES)
void randombytes(uint8_t *out, size_t outlen);
static MLK_INLINE void mlk_randombytes(uint8_t *out, size_t outlen)
__contract__(
  requires(memory_no_alias(out, outlen))
  assigns(memory_slice(out, outlen)))
{
  /* Default randomness hook: forward to the externally provided
   * randombytes(). Only compiled when MLK_CONFIG_CUSTOM_RANDOMBYTES
   * is not defined. */
  randombytes(out, outlen);
}
#endif /* !MLK_CONFIG_CUSTOM_RANDOMBYTES */
#endif /* !MLK_RANDOMBYTES_H */

View File

@@ -0,0 +1,362 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*/
#include "common.h"
#if !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include "debug.h"
#include "sampling.h"
#include "symmetric.h"
/* Reference: `rej_uniform()` in the reference implementation @[REF].
* - Our signature differs from the reference implementation
* in that it adds the offset and always expects the base of the
* target buffer. This avoids shifting the buffer base in the
* caller, which appears tricky to reason about. */
static unsigned mlk_rej_uniform_scalar(int16_t *r, unsigned target,
                                       unsigned offset, const uint8_t *buf,
                                       unsigned buflen)
__contract__(
  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
  requires(memory_no_alias(r, sizeof(int16_t) * target))
  requires(memory_no_alias(buf, buflen))
  requires(array_bound(r, 0, offset, 0, MLKEM_Q))
  assigns(memory_slice(r, sizeof(int16_t) * target))
  ensures(offset <= return_value && return_value <= target)
  ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
)
{
  /* accepted: number of valid coefficients stored in r so far.
   * rd:       read position within buf, advanced three bytes at a time. */
  unsigned accepted = offset;
  unsigned rd = 0;

  mlk_assert_bound(r, offset, 0, MLKEM_Q);

  /* rd + 3 cannot overflow due to the assumption buflen <= 4096 */
  while (accepted < target && rd + 3 <= buflen)
  __loop__(
    invariant(offset <= accepted && accepted <= target && rd <= buflen)
    invariant(array_bound(r, 0, accepted, 0, MLKEM_Q)))
  {
    /* Unpack three consecutive bytes into two 12-bit candidates. */
    const uint16_t cand_lo =
        ((buf[rd + 0] >> 0) | ((uint16_t)buf[rd + 1] << 8)) & 0xFFF;
    const uint16_t cand_hi =
        ((buf[rd + 1] >> 4) | ((uint16_t)buf[rd + 2] << 4)) & 0xFFF;
    rd += 3;

    /* Keep a candidate only if it is a canonical representative mod q. */
    if (cand_lo < MLKEM_Q)
    {
      r[accepted] = (int16_t)cand_lo;
      accepted++;
    }
    /* The first candidate may have filled the last slot; re-check. */
    if (accepted < target && cand_hi < MLKEM_Q)
    {
      r[accepted] = (int16_t)cand_hi;
      accepted++;
    }
  }

  mlk_assert_bound(r, accepted, 0, MLKEM_Q);
  return accepted;
}
/*************************************************
* Name: mlk_rej_uniform
*
* Description: Run rejection sampling on uniform random bytes to generate
* uniform random integers mod q
*
* Arguments: - int16_t *r: pointer to output buffer
* - unsigned target: requested number of 16-bit integers
* (uniform mod q).
* Must be <= 4096.
* - unsigned offset: number of 16-bit integers that have
* already been sampled.
* Must be <= target.
* - const uint8_t *buf: pointer to input buffer
* (assumed to be uniform random bytes)
* - unsigned buflen: length of input buffer in bytes
* Must be <= 4096.
* Must be a multiple of 3.
*
* Note: Strictly speaking, only a few values of buflen near UINT_MAX need
* excluding. The limit of 4096 is somewhat arbitrary but sufficient for all
* uses of this function. Similarly, the actual limit for target is UINT_MAX/2.
*
* Returns the new offset of sampled 16-bit integers, at most target,
* and at least the initial offset.
* If the new offset is strictly less than target, all of the input buffer
* is guaranteed to have been consumed. If it is equal to target, no information
* is provided on how many bytes of the input buffer have been consumed.
**************************************************/
/* Reference: `rej_uniform()` in the reference implementation @[REF].
* - Our signature differs from the reference implementation
* in that it adds the offset and always expects the base of the
* target buffer. This avoids shifting the buffer base in the
* caller, which appears tricky to reason about.
* - Optional fallback to native implementation. */
static unsigned mlk_rej_uniform(int16_t *r, unsigned target, unsigned offset,
                                const uint8_t *buf, unsigned buflen)
__contract__(
  requires(offset <= target && target <= 4096 && buflen <= 4096 && buflen % 3 == 0)
  requires(memory_no_alias(r, sizeof(int16_t) * target))
  requires(memory_no_alias(buf, buflen))
  requires(array_bound(r, 0, offset, 0, MLKEM_Q))
  assigns(memory_slice(r, sizeof(int16_t) * target))
  ensures(offset <= return_value && return_value <= target)
  ensures(array_bound(r, 0, return_value, 0, MLKEM_Q))
)
{
#if defined(MLK_USE_NATIVE_REJ_UNIFORM)
  /* The native backend is only tried when sampling starts from an empty
   * output (offset == 0); it may decline by returning
   * MLK_NATIVE_FUNC_FALLBACK, in which case the scalar code below runs. */
  if (offset == 0)
  {
    const int native_ret = mlk_rej_uniform_native(r, target, buf, buflen);
    if (native_ret != MLK_NATIVE_FUNC_FALLBACK)
    {
      const unsigned sampled = (unsigned)native_ret;
      mlk_assert_bound(r, sampled, 0, MLKEM_Q);
      return sampled;
    }
  }
#endif /* MLK_USE_NATIVE_REJ_UNIFORM */

  /* Portable C path; also continues a partially filled output. */
  return mlk_rej_uniform_scalar(r, target, offset, buf, buflen);
}
/* Number of XOF blocks squeezed up-front by mlk_poly_rej_uniform() and
 * mlk_poly_rej_uniform_x4() before falling back to one block at a time.
 * Can be overridden by defining the macro before this point.
 *
 * Derivation of the default:
 * - 12 * MLKEM_N / 8 is the number of bytes holding MLKEM_N 12-bit candidates.
 * - Scaling by (1 << 12) / MLKEM_Q accounts for candidates >= MLKEM_Q being
 *   rejected by the sampler.
 * - With x the resulting byte count, (x + MLK_XOF_RATE) / MLK_XOF_RATE
 *   evaluates to floor(x / MLK_XOF_RATE) + 1 whole blocks. */
#ifndef MLKEM_GEN_MATRIX_NBLOCKS
#define MLKEM_GEN_MATRIX_NBLOCKS \
((12 * MLKEM_N / 8 * (1 << 12) / MLKEM_Q + MLK_XOF_RATE) / MLK_XOF_RATE)
#endif
/* Reference: Does not exist in the reference implementation @[REF].
* - x4-batched version of `rej_uniform()` from the
* reference implementation, leveraging x4-batched Keccak-f1600. */
/*
 * Fills four polynomials with MLKEM_N coefficients each, all in
 * [0, MLKEM_Q), by rejection sampling the output of a 4-way batched XOF.
 *
 * Arguments: - vec0..vec3: output polynomials, one per XOF lane
 *            - seed: four XOF inputs; MLKEM_SYMBYTES + 2 bytes of each
 *              are absorbed
 *
 * The XOF output buffer is zeroized before returning.
 */
MLK_INTERNAL_API
void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
{
/* Temporary buffers for XOF output before rejection sampling */
MLK_ALIGN uint8_t
buf[4][MLK_ALIGN_UP(MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE)];
/* Tracks the number of coefficients we have already sampled */
unsigned ctr[4];
mlk_xof_x4_ctx statex;
/* Number of valid bytes per lane in buf for the next sampling call */
unsigned buflen;
mlk_xof_x4_init(&statex);
mlk_xof_x4_absorb(&statex, seed, MLKEM_SYMBYTES + 2);
/*
* Initially, squeeze heuristic number of MLKEM_GEN_MATRIX_NBLOCKS.
* This should generate the matrix entries with high probability.
*/
mlk_xof_x4_squeezeblocks(buf, MLKEM_GEN_MATRIX_NBLOCKS, &statex);
buflen = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
/* First pass starts every lane from an empty output (offset 0) */
ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, 0, buf[0], buflen);
ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, 0, buf[1], buflen);
ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, 0, buf[2], buflen);
ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, 0, buf[3], buflen);
/*
* So long as not all matrix entries have been generated, squeeze
* one more block a time until we're done.
*/
buflen = MLK_XOF_RATE;
while (ctr[0] < MLKEM_N || ctr[1] < MLKEM_N || ctr[2] < MLKEM_N ||
ctr[3] < MLKEM_N)
__loop__(
assigns(ctr, statex,
memory_slice(vec0, sizeof(mlk_poly)),
memory_slice(vec1, sizeof(mlk_poly)),
memory_slice(vec2, sizeof(mlk_poly)),
memory_slice(vec3, sizeof(mlk_poly)),
object_whole(buf[0]),
object_whole(buf[1]),
object_whole(buf[2]),
object_whole(buf[3]))
invariant(ctr[0] <= MLKEM_N && ctr[1] <= MLKEM_N)
invariant(ctr[2] <= MLKEM_N && ctr[3] <= MLKEM_N)
invariant(array_bound(vec0->coeffs, 0, ctr[0], 0, MLKEM_Q))
invariant(array_bound(vec1->coeffs, 0, ctr[1], 0, MLKEM_Q))
invariant(array_bound(vec2->coeffs, 0, ctr[2], 0, MLKEM_Q))
invariant(array_bound(vec3->coeffs, 0, ctr[3], 0, MLKEM_Q)))
{
/* All four lanes are squeezed and sampled together, including lanes
 * that are already complete; each call resumes from that lane's
 * current count. */
mlk_xof_x4_squeezeblocks(buf, 1, &statex);
ctr[0] = mlk_rej_uniform(vec0->coeffs, MLKEM_N, ctr[0], buf[0], buflen);
ctr[1] = mlk_rej_uniform(vec1->coeffs, MLKEM_N, ctr[1], buf[1], buflen);
ctr[2] = mlk_rej_uniform(vec2->coeffs, MLKEM_N, ctr[2], buf[2], buflen);
ctr[3] = mlk_rej_uniform(vec3->coeffs, MLKEM_N, ctr[3], buf[3], buflen);
}
mlk_xof_x4_release(&statex);
/* Specification: Partially implements
* @[FIPS203, Section 3.3, Destruction of intermediate values] */
mlk_zeroize(buf, sizeof(buf));
}
MLK_INTERNAL_API
void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
{
  mlk_xof_ctx xof;
  /* Scratch space for XOF output awaiting rejection sampling */
  MLK_ALIGN uint8_t xof_out[MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE];
  /* sampled: coefficients accepted so far; avail: valid bytes in xof_out */
  unsigned sampled, avail;

  mlk_xof_init(&xof);
  mlk_xof_absorb(&xof, seed, MLKEM_SYMBYTES + 2);

  /* First round: squeeze the heuristic number of blocks, which should
   * yield the full matrix entry with high probability. */
  mlk_xof_squeezeblocks(xof_out, MLKEM_GEN_MATRIX_NBLOCKS, &xof);
  avail = MLKEM_GEN_MATRIX_NBLOCKS * MLK_XOF_RATE;
  sampled = mlk_rej_uniform(entry->coeffs, MLKEM_N, 0, xof_out, avail);

  /* Top-up rounds: one additional block at a time until complete. */
  avail = MLK_XOF_RATE;
  while (sampled < MLKEM_N)
  __loop__(
    assigns(sampled, xof, memory_slice(entry, sizeof(mlk_poly)), object_whole(xof_out))
    invariant(sampled <= MLKEM_N)
    invariant(array_bound(entry->coeffs, 0, sampled, 0, MLKEM_Q)))
  {
    mlk_xof_squeezeblocks(xof_out, 1, &xof);
    sampled = mlk_rej_uniform(entry->coeffs, MLKEM_N, sampled, xof_out, avail);
  }

  mlk_xof_release(&xof);

  /* Specification: Partially implements
   * @[FIPS203, Section 3.3, Destruction of intermediate values] */
  mlk_zeroize(xof_out, sizeof(xof_out));
}
/*************************************************
* Name: mlk_load32_littleendian
*
* Description: load 4 bytes into a 32-bit integer
* in little-endian order
*
* Arguments: - const uint8_t *x: pointer to input byte array
*
* Returns 32-bit unsigned integer loaded from x
*
**************************************************/
/* Reference: `load32_littleendian()` in the reference implementation @[REF]. */
static uint32_t mlk_load32_littleendian(const uint8_t x[4])
{
  /* x[0] is the least significant byte, x[3] the most significant.
   * Each byte is widened to 32 bits before shifting. */
  const uint32_t b0 = (uint32_t)x[0];
  const uint32_t b1 = (uint32_t)x[1] << 8;
  const uint32_t b2 = (uint32_t)x[2] << 16;
  const uint32_t b3 = (uint32_t)x[3] << 24;
  return b0 | b1 | b2 | b3;
}
/* Reference: `cbd2()` in the reference implementation @[REF]. */
MLK_INTERNAL_API
void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4])
{
  /* Each 32-bit input word yields 8 coefficients. */
  unsigned blk;
  for (blk = 0; blk < MLKEM_N / 8; blk++)
  __loop__(
    invariant(blk <= MLKEM_N / 8)
    invariant(array_abs_bound(r->coeffs, 0, 8 * blk, 3)))
  {
    unsigned k;
    const uint32_t word = mlk_load32_littleendian(buf + 4 * blk);
    /* Add adjacent bits: every 2-bit field of `sums` holds the number of
     * set bits in the corresponding 2-bit field of `word`. */
    const uint32_t sums = (word & 0x55555555) + ((word >> 1) & 0x55555555);

    for (k = 0; k < 8; k++)
    __loop__(
      invariant(blk <= MLKEM_N / 8 && k <= 8)
      invariant(array_abs_bound(r->coeffs, 0, 8 * blk + k, 3)))
    {
      /* A coefficient is the difference of two neighbouring 2-bit sums. */
      const int16_t plus = (sums >> (4 * k + 0)) & 0x3;
      const int16_t minus = (sums >> (4 * k + 2)) & 0x3;
      r->coeffs[8 * blk + k] = plus - minus;
    }
  }
}
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_ETA1 == 3
/*************************************************
* Name: mlk_load24_littleendian
*
* Description: load 3 bytes into a 32-bit integer
* in little-endian order.
* This function is only needed for ML-KEM-512
*
* Arguments: - const uint8_t *x: pointer to input byte array
*
* Returns 32-bit unsigned integer loaded from x (most significant byte is zero)
*
**************************************************/
/* Reference: `load24_littleendian()` in the reference implementation @[REF]. */
static uint32_t mlk_load24_littleendian(const uint8_t x[3])
{
  /* Widen each byte before shifting so the shifts happen in 32 bits, then
   * combine; bits 24..31 of the result are always zero. */
  const uint32_t low = (uint32_t)x[0];
  const uint32_t mid = (uint32_t)x[1] << 8;
  const uint32_t high = (uint32_t)x[2] << 16;
  return high | mid | low;
}
/* Reference: `cbd3()` in the reference implementation @[REF]. */
/* Computes all MLKEM_N coefficients of r from the 3 * MLKEM_N / 4 input
 * bytes in buf. Every coefficient is the difference of two values, each of
 * which is the sum of three input bits, and therefore lies in [-3, 3].
 * The input is consumed 3 bytes at a time, producing 4 coefficients per
 * outer iteration. */
MLK_INTERNAL_API
void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4])
{
unsigned i;
for (i = 0; i < MLKEM_N / 4; i++)
__loop__(
invariant(i <= MLKEM_N / 4)
invariant(array_abs_bound(r->coeffs, 0, 4 * i, 4)))
{
unsigned j;
/* Next 24 input bits; they determine coefficients 4*i .. 4*i + 3. */
const uint32_t t = mlk_load24_littleendian(buf + 3 * i);
/* The mask 0x00249249 keeps every third bit (bits 0, 3, ..., 21). Summing
 * the three shifted copies leaves, in every aligned 3-bit field of d, the
 * number (0..3) of set bits that t had in that field. */
uint32_t d = t & 0x00249249;
d += (t >> 1) & 0x00249249;
d += (t >> 2) & 0x00249249;
for (j = 0; j < 4; j++)
__loop__(
invariant(i <= MLKEM_N / 4 && j <= 4)
invariant(array_abs_bound(r->coeffs, 0, 4 * i + j, 4)))
{
/* Coefficient j uses the 6-bit group starting at bit 6*j: the bit count
 * in its low 3-bit field minus the bit count in its high 3-bit field. */
const int16_t a = (d >> (6 * j + 0)) & 0x7;
const int16_t b = (d >> (6 * j + 3)) & 0x7;
r->coeffs[4 * i + j] = a - b;
}
}
}
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
#else /* !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(sampling)
#endif /* MLK_CONFIG_MULTILEVEL_NO_SHARED */
/* To facilitate single-compilation-unit (SCU) builds, undefine all macros.
* Don't modify by hand -- this is auto-generated by scripts/autogen. */
#undef MLKEM_GEN_MATRIX_NBLOCKS

View File

@@ -0,0 +1,118 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*/
#ifndef MLK_SAMPLING_H
#define MLK_SAMPLING_H
#include <stdint.h>
#include <stdlib.h>
#include "cbmc.h"
#include "common.h"
#include "poly.h"
#define mlk_poly_cbd2 MLK_NAMESPACE(poly_cbd2)
/*************************************************
* Name: mlk_poly_cbd2
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter eta=2
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
*
* Specification: Implements @[FIPS203, Algorithm 8, SamplePolyCBD_2]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_cbd2(mlk_poly *r, const uint8_t buf[2 * MLKEM_N / 4]);
#if defined(MLK_CONFIG_MULTILEVEL_WITH_SHARED) || MLKEM_ETA1 == 3
#define mlk_poly_cbd3 MLK_NAMESPACE(poly_cbd3)
/*************************************************
* Name: mlk_poly_cbd3
*
* Description: Given an array of uniformly random bytes, compute
* polynomial with coefficients distributed according to
* a centered binomial distribution with parameter eta=3.
* This function is only needed for ML-KEM-512
*
* Arguments: - mlk_poly *r: pointer to output polynomial
* - const uint8_t *buf: pointer to input byte array
*
* Specification: Implements @[FIPS203, Algorithm 8, SamplePolyCBD_3]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_cbd3(mlk_poly *r, const uint8_t buf[3 * MLKEM_N / 4]);
#endif /* MLK_CONFIG_MULTILEVEL_WITH_SHARED || MLKEM_ETA1 == 3 */
#define mlk_poly_rej_uniform_x4 MLK_NAMESPACE(poly_rej_uniform_x4)
/*************************************************
* Name: mlk_poly_rej_uniform_x4
*
* Description: Generate four polynomials using rejection sampling
* on (pseudo-)uniformly random bytes sampled from a seed.
*
* Arguments: - mlk_poly *vec0, *vec1, *vec2, *vec3:
* Pointers to 4 polynomials to be sampled.
* - uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)]:
* Pointer to a consecutive array of seed buffers of size
* MLKEM_SYMBYTES + 2 each, plus padding for alignment.
*
* Specification: Implements @[FIPS203, Algorithm 7, SampleNTT]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_rej_uniform_x4(mlk_poly *vec0, mlk_poly *vec1, mlk_poly *vec2,
mlk_poly *vec3,
uint8_t seed[4][MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)])
__contract__(
requires(memory_no_alias(vec0, sizeof(mlk_poly)))
requires(memory_no_alias(vec1, sizeof(mlk_poly)))
requires(memory_no_alias(vec2, sizeof(mlk_poly)))
requires(memory_no_alias(vec3, sizeof(mlk_poly)))
requires(memory_no_alias(seed, 4 * MLK_ALIGN_UP(MLKEM_SYMBYTES + 2)))
assigns(memory_slice(vec0, sizeof(mlk_poly)))
assigns(memory_slice(vec1, sizeof(mlk_poly)))
assigns(memory_slice(vec2, sizeof(mlk_poly)))
assigns(memory_slice(vec3, sizeof(mlk_poly)))
ensures(array_bound(vec0->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
ensures(array_bound(vec1->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
ensures(array_bound(vec2->coeffs, 0, MLKEM_N, 0, MLKEM_Q))
ensures(array_bound(vec3->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
#define mlk_poly_rej_uniform MLK_NAMESPACE(poly_rej_uniform)
/*************************************************
* Name: mlk_poly_rej_uniform
*
* Description: Generate polynomial using rejection sampling
* on (pseudo-)uniformly random bytes sampled from a seed.
*
* Arguments: - mlk_poly *entry: Pointer to polynomial to be sampled.
* - uint8_t *seed: Pointer to seed buffer of size
* MLKEM_SYMBYTES + 2.
*
* Specification: Implements @[FIPS203, Algorithm 7, SampleNTT]
*
**************************************************/
MLK_INTERNAL_API
void mlk_poly_rej_uniform(mlk_poly *entry, uint8_t seed[MLKEM_SYMBYTES + 2])
__contract__(
requires(memory_no_alias(entry, sizeof(mlk_poly)))
requires(memory_no_alias(seed, MLKEM_SYMBYTES + 2))
assigns(memory_slice(entry, sizeof(mlk_poly)))
ensures(array_bound(entry->coeffs, 0, MLKEM_N, 0, MLKEM_Q)));
#endif /* !MLK_SAMPLING_H */

View File

@@ -0,0 +1,69 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*/
#ifndef MLK_SYMMETRIC_H
#define MLK_SYMMETRIC_H
#include <stddef.h>
#include <stdint.h>
#include "cbmc.h"
#include "common.h"
#include MLK_FIPS202_HEADER_FILE
#include MLK_FIPS202X4_HEADER_FILE
/* Macros denoting FIPS 203 specific Hash functions */
/* Hash function H, @[FIPS203, Section 4.1, Eq (4.4)] */
#define mlk_hash_h(OUT, IN, INBYTES) mlk_sha3_256(OUT, IN, INBYTES)
/* Hash function G, @[FIPS203, Section 4.1, Eq (4.5)] */
#define mlk_hash_g(OUT, IN, INBYTES) mlk_sha3_512(OUT, IN, INBYTES)
/* Hash function J, @[FIPS203, Section 4.1, Eq (4.4)] */
#define mlk_hash_j(OUT, IN, INBYTES) \
mlk_shake256(OUT, MLKEM_SYMBYTES, IN, INBYTES)
/* PRF function, @[FIPS203, Section 4.1, Eq (4.3)]
* Referring to (eq 4.3), `OUT` is assumed to contain `s || b`. */
#define mlk_prf_eta(ETA, OUT, IN) \
mlk_shake256(OUT, (ETA) * MLKEM_N / 4, IN, MLKEM_SYMBYTES + 1)
#define mlk_prf_eta1(OUT, IN) mlk_prf_eta(MLKEM_ETA1, OUT, IN)
#define mlk_prf_eta2(OUT, IN) mlk_prf_eta(MLKEM_ETA2, OUT, IN)
#define mlk_prf_eta1_x4(OUT, IN) \
mlk_shake256x4((OUT)[0], (OUT)[1], (OUT)[2], (OUT)[3], \
(MLKEM_ETA1 * MLKEM_N / 4), (IN)[0], (IN)[1], (IN)[2], \
(IN)[3], MLKEM_SYMBYTES + 1)
/* XOF function, FIPS 203 4.1 */
#define mlk_xof_ctx mlk_shake128ctx
#define mlk_xof_x4_ctx mlk_shake128x4ctx
#define mlk_xof_init(CTX) mlk_shake128_init((CTX))
#define mlk_xof_absorb(CTX, IN, INBYTES) \
mlk_shake128_absorb_once((CTX), (IN), (INBYTES))
#define mlk_xof_squeezeblocks(BUF, NBLOCKS, CTX) \
mlk_shake128_squeezeblocks((BUF), (NBLOCKS), (CTX))
#define mlk_xof_release(CTX) mlk_shake128_release((CTX))
#define mlk_xof_x4_init(CTX) mlk_shake128x4_init((CTX))
#define mlk_xof_x4_absorb(CTX, IN, INBYTES) \
mlk_shake128x4_absorb_once((CTX), (IN)[0], (IN)[1], (IN)[2], (IN)[3], \
(INBYTES))
#define mlk_xof_x4_squeezeblocks(BUF, NBLOCKS, CTX) \
mlk_shake128x4_squeezeblocks((BUF)[0], (BUF)[1], (BUF)[2], (BUF)[3], \
(NBLOCKS), (CTX))
#define mlk_xof_x4_release(CTX) mlk_shake128x4_release((CTX))
#define MLK_XOF_RATE SHAKE128_RATE
#endif /* !MLK_SYMMETRIC_H */

View File

@@ -0,0 +1,233 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#ifndef MLK_SYS_H
#define MLK_SYS_H
#if !defined(MLK_CONFIG_NO_ASM) && (defined(__GNUC__) || defined(__clang__))
#define MLK_HAVE_INLINE_ASM
#endif
/* Try to find endianness, if not forced through CFLAGS already */
#if !defined(MLK_SYS_LITTLE_ENDIAN) && !defined(MLK_SYS_BIG_ENDIAN)
#if defined(__BYTE_ORDER__)
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define MLK_SYS_LITTLE_ENDIAN
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define MLK_SYS_BIG_ENDIAN
#else
#error "__BYTE_ORDER__ defined, but don't recognize value."
#endif
#endif /* __BYTE_ORDER__ */
#endif /* !MLK_SYS_LITTLE_ENDIAN && !MLK_SYS_BIG_ENDIAN */
/* Check if we're running on an AArch64 little endian system. _M_ARM64 is set by
* MSVC. */
#if defined(__AARCH64EL__) || defined(_M_ARM64)
#define MLK_SYS_AARCH64
#endif
/* Check if we're running on an AArch64 big endian system. */
#if defined(__AARCH64EB__)
#define MLK_SYS_AARCH64_EB
#endif
#if defined(__x86_64__)
#define MLK_SYS_X86_64
#if defined(__AVX2__)
#define MLK_SYS_X86_64_AVX2
#endif
#endif /* __x86_64__ */
#if defined(MLK_SYS_LITTLE_ENDIAN) && defined(__powerpc64__)
#define MLK_SYS_PPC64LE
#endif
#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 64
#define MLK_SYS_RISCV64
#endif
#if defined(__riscv) && defined(__riscv_xlen) && __riscv_xlen == 32
#define MLK_SYS_RISCV32
#endif
#if defined(_WIN32)
#define MLK_SYS_WINDOWS
#endif
#if defined(__linux__)
#define MLK_SYS_LINUX
#endif
#if defined(__APPLE__)
#define MLK_SYS_APPLE
#endif
#if defined(MLK_FORCE_AARCH64) && !defined(MLK_SYS_AARCH64)
#error "MLK_FORCE_AARCH64 is set, but we don't seem to be on an AArch64 system."
#endif
#if defined(MLK_FORCE_AARCH64_EB) && !defined(MLK_SYS_AARCH64_EB)
#error \
"MLK_FORCE_AARCH64_EB is set, but we don't seem to be on an AArch64 system."
#endif
#if defined(MLK_FORCE_X86_64) && !defined(MLK_SYS_X86_64)
#error "MLK_FORCE_X86_64 is set, but we don't seem to be on an X86_64 system."
#endif
#if defined(MLK_FORCE_PPC64LE) && !defined(MLK_SYS_PPC64LE)
#error "MLK_FORCE_PPC64LE is set, but we don't seem to be on a PPC64LE system."
#endif
#if defined(MLK_FORCE_RISCV64) && !defined(MLK_SYS_RISCV64)
#error "MLK_FORCE_RISCV64 is set, but we don't seem to be on a RISCV64 system."
#endif
#if defined(MLK_FORCE_RISCV32) && !defined(MLK_SYS_RISCV32)
#error "MLK_FORCE_RISCV32 is set, but we don't seem to be on a RISCV32 system."
#endif
/*
* C90 does not have the inline compiler directive yet.
* We don't use it in C90 builds.
* However, in that case the compiler warns about some inline functions in
* header files not being used in every compilation unit that includes that
* header. To work around it we silence that warning in that case using
* __attribute__((unused)).
*/
/* Do not use inline for C90 builds*/
#if !defined(MLK_INLINE)
#if !defined(inline)
#if defined(_MSC_VER)
#define MLK_INLINE __inline
/* Don't combine __inline and __forceinline */
#define MLK_ALWAYS_INLINE __forceinline
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define MLK_INLINE inline
#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
#else
#define MLK_INLINE __attribute__((unused))
#define MLK_ALWAYS_INLINE MLK_INLINE
#endif
#else /* !inline */
#define MLK_INLINE inline
#define MLK_ALWAYS_INLINE MLK_INLINE __attribute__((always_inline))
#endif /* inline */
#endif /* !MLK_INLINE */
/*
* C90 does not have the restrict compiler directive yet.
* We don't use it in C90 builds.
*/
#if !defined(restrict)
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define MLK_RESTRICT restrict
#else
#define MLK_RESTRICT
#endif
#else /* !restrict */
#define MLK_RESTRICT restrict
#endif /* restrict */
#define MLK_DEFAULT_ALIGN 32
#define MLK_ALIGN_UP(N) \
((((N) + (MLK_DEFAULT_ALIGN - 1)) / MLK_DEFAULT_ALIGN) * MLK_DEFAULT_ALIGN)
#if defined(__GNUC__)
#define MLK_ALIGN __attribute__((aligned(MLK_DEFAULT_ALIGN)))
#elif defined(_MSC_VER)
#define MLK_ALIGN __declspec(align(MLK_DEFAULT_ALIGN))
#else
#define MLK_ALIGN /* No known support for alignment constraints */
#endif
/* New X86_64 CPUs support Control-flow protection using the CET instructions.
* When enabled (through -fcf-protection=), all compilation units (including
* empty ones) need to support CET for this to work.
* For assembly, this means that source files need to signal support for
* CET by setting the appropriate note.gnu.property section.
* This can be achieved by including the <cet.h> header in all assembly files.
* This file also provides the _CET_ENDBR macro which needs to be placed at
* every potential target of an indirect branch.
* If CET is enabled _CET_ENDBR maps to the endbr64 instruction, otherwise
* it is empty.
* In case the compiler does not support CET (e.g., <gcc8, <clang11),
* the __CET__ macro is not set and we default to nothing.
* Note that we only issue _CET_ENDBR instructions through the MLK_ASM_FN_SYMBOL
* macro as the global symbols are the only possible targets of indirect
* branches in our code.
*/
#if defined(MLK_SYS_X86_64)
#if defined(__CET__)
#include <cet.h>
#define MLK_CET_ENDBR _CET_ENDBR
#else
#define MLK_CET_ENDBR
#endif
#endif /* MLK_SYS_X86_64 */
#if defined(MLK_CONFIG_CT_TESTING_ENABLED) && !defined(__ASSEMBLER__)
#include <valgrind/memcheck.h>
#define MLK_CT_TESTING_SECRET(ptr, len) \
VALGRIND_MAKE_MEM_UNDEFINED((ptr), (len))
#define MLK_CT_TESTING_DECLASSIFY(ptr, len) \
VALGRIND_MAKE_MEM_DEFINED((ptr), (len))
#else /* MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__ */
#define MLK_CT_TESTING_SECRET(ptr, len) \
do \
{ \
} while (0)
#define MLK_CT_TESTING_DECLASSIFY(ptr, len) \
do \
{ \
} while (0)
#endif /* !(MLK_CONFIG_CT_TESTING_ENABLED && !__ASSEMBLER__) */
/* Attribute for functions whose return value callers must not ignore.
 * Expands to warn_unused_result on GNU-compatible compilers and to nothing
 * elsewhere. Clang is detected through its predefined macro __clang__, so
 * Clang configurations that do not define __GNUC__ still get the attribute. */
#if defined(__GNUC__) || defined(__clang__)
#define MLK_MUST_CHECK_RETURN_VALUE __attribute__((warn_unused_result))
#else
#define MLK_MUST_CHECK_RETURN_VALUE
#endif
#if !defined(__ASSEMBLER__)
/* System capability enumeration */
/* Identifiers that can be passed to mlk_sys_check_capability(). Exactly one
 * enumerator is defined; which one depends on whether MLK_SYS_X86_64 or
 * MLK_SYS_AARCH64 is set for the target. */
typedef enum
{
#if defined(MLK_SYS_X86_64)
/* Only capability defined for x86_64 targets. */
MLK_SYS_CAP_AVX2
#elif defined(MLK_SYS_AARCH64)
/* Only capability defined for little-endian AArch64 targets. */
MLK_SYS_CAP_SHA3
#else
/* C90 does not allow empty enums, so use a dummy value
* for architectures other than AArch64 and x86_64. */
MLK_SYS_CAP_DUMMY
#endif
} mlk_sys_cap;
#if !defined(MLK_CONFIG_CUSTOM_CAPABILITY_FUNC)
#include "cbmc.h"
/* Default capability query: reports every capability as available.
 *
 * Arguments: mlk_sys_cap cap: capability being queried (ignored here).
 *
 * Returns 1 unconditionally. */
static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
__contract__(
ensures(return_value == 0 || return_value == 1)
)
{
/* By default, we rely on compile-time feature detection/specification:
* If a feature is enabled at compile-time, we assume it is supported by
* the host that the resulting library/binary will be run on.
* If this assumption is not true, you MUST overwrite this function.
* See the documentation of MLK_CONFIG_CUSTOM_CAPABILITY_FUNC in config.h
* for more information. */
(void)cap;
return 1;
}
#endif /* !MLK_CONFIG_CUSTOM_CAPABILITY_FUNC */
#endif /* !__ASSEMBLER__ */
#endif /* !MLK_SYS_H */

View File

@@ -0,0 +1,20 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
#include "verify.h"
#if !defined(MLK_USE_ASM_VALUE_BARRIER) && \
!defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
/*
* Masking value used in constant-time functions from
* verify.h to block the compiler's range analysis and
* thereby reduce the risk of compiler-introduced branches.
*/
volatile uint64_t mlk_ct_opt_blocker_u64 = 0;
#else /* !MLK_USE_ASM_VALUE_BARRIER && !MLK_CONFIG_MULTILEVEL_NO_SHARED */
MLK_EMPTY_CU(verify)
#endif /* !(!MLK_USE_ASM_VALUE_BARRIER && !MLK_CONFIG_MULTILEVEL_NO_SHARED) */

View File

@@ -0,0 +1,447 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/* References
* ==========
*
* - [FIPS203]
* FIPS 203 Module-Lattice-Based Key-Encapsulation Mechanism Standard
* National Institute of Standards and Technology
* https://csrc.nist.gov/pubs/fips/203/final
*
* - [REF]
* CRYSTALS-Kyber C reference implementation
* Bos, Ducas, Kiltz, Lepoint, Lyubashevsky, Schanck, Schwabe, Seiler, Stehlé
* https://github.com/pq-crystals/kyber/tree/main/ref
*
* - [libmceliece]
* libmceliece implementation of Classic McEliece
* Bernstein, Chou
* https://lib.mceliece.org/
*
* - [optblocker]
* PQC forum post on opt-blockers using volatile globals
* Daniel J. Bernstein
* https://groups.google.com/a/list.nist.gov/g/pqc-forum/c/hqbtIGFKIpU/m/H14H0wOlBgAJ
*/
#ifndef MLK_VERIFY_H
#define MLK_VERIFY_H
#include <limits.h>
#include <stddef.h>
#include <stdint.h>
#include "cbmc.h"
#include "common.h"
/* Constant-time comparisons and conditional operations
We reduce the risk for compilation into variable-time code
through the use of 'value barriers'.
Functionally, a value barrier is a no-op. To the compiler, however,
it constitutes an arbitrary modification of its input, and therefore
hinders value propagation and range analysis.
We consider two approaches to implement a value barrier:
- An empty inline asm block which marks the target value as clobbered.
- XOR'ing with the value of a volatile global that's set to 0;
see @[optblocker] for a discussion of this idea, and
@[libmceliece, inttypes/crypto_intN.h] for an implementation.
The first approach is cheap because it only prevents the compiler
from reasoning about the value of the variable past the barrier,
but does not directly generate additional instructions.
The second approach generates redundant loads and XOR operations
and therefore comes at a higher runtime cost. However, it appears
more robust towards optimization, as compilers should never drop
a volatile load.
We use the empty-ASM value barrier for GCC and clang, and fall
back to the global volatile barrier otherwise.
The global value barrier can be forced by setting
MLK_CONFIG_NO_ASM_VALUE_BARRIER.
*/
#if defined(MLK_HAVE_INLINE_ASM) && !defined(MLK_CONFIG_NO_ASM_VALUE_BARRIER)
#define MLK_USE_ASM_VALUE_BARRIER
#endif
#if !defined(MLK_USE_ASM_VALUE_BARRIER)
/*
* Declaration of global volatile that the global value barrier
* is loading from and masking with.
*/
#define mlk_ct_opt_blocker_u64 MLK_NAMESPACE(ct_opt_blocker_u64)
extern volatile uint64_t mlk_ct_opt_blocker_u64;
/* Helper functions for obtaining global masks of various sizes */
/* This contract is not proved but treated as an axiom.
*
* Its validity relies on the assumption that the global opt-blocker
* constant mlk_ct_opt_blocker_u64 is not modified.
*/
/* Reads the volatile global opt-blocker. The contract states the value is 0;
 * because the read is volatile, the compiler cannot rely on that. */
static MLK_INLINE uint64_t mlk_ct_get_optblocker_u64(void)
__contract__(ensures(return_value == 0)) { return mlk_ct_opt_blocker_u64; }
/* Opt-blocker value truncated to 8 bits. */
static MLK_INLINE uint8_t mlk_ct_get_optblocker_u8(void)
__contract__(ensures(return_value == 0)) { return (uint8_t)mlk_ct_get_optblocker_u64(); }
/* Opt-blocker value truncated to 32 bits (unsigned). */
static MLK_INLINE uint32_t mlk_ct_get_optblocker_u32(void)
__contract__(ensures(return_value == 0)) { return (uint32_t)mlk_ct_get_optblocker_u64(); }
/* Opt-blocker value converted to a signed 32-bit integer. */
static MLK_INLINE int32_t mlk_ct_get_optblocker_i32(void)
__contract__(ensures(return_value == 0)) { return (int32_t)mlk_ct_get_optblocker_u64(); }
/* Opt-blocker based implementation of value barriers */
/* Each barrier XORs its argument with a freshly read opt-blocker value. By
 * the opt-blocker contracts that value is 0, so the argument is returned
 * unchanged, but the result depends on a volatile read. */
/* Value barrier for uint32_t. */
static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
__contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u32()); }
/* Value barrier for int32_t. */
static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
__contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_i32()); }
/* Value barrier for uint8_t. */
static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
__contract__(ensures(return_value == b)) { return (b ^ mlk_ct_get_optblocker_u8()); }
#else /* !MLK_USE_ASM_VALUE_BARRIER */
/* Inline-asm based implementation of value barriers.
 * Each function passes its argument through an empty asm statement as a
 * read-write register operand ("+r") and returns it. No instruction is
 * emitted by the asm body itself, but the compiler has to treat the value
 * as potentially modified. */
/* Value barrier for uint32_t. */
static MLK_INLINE uint32_t mlk_value_barrier_u32(uint32_t b)
__contract__(ensures(return_value == b))
{
__asm__("" : "+r"(b));
return b;
}
/* Value barrier for int32_t. */
static MLK_INLINE int32_t mlk_value_barrier_i32(int32_t b)
__contract__(ensures(return_value == b))
{
__asm__("" : "+r"(b));
return b;
}
/* Value barrier for uint8_t. */
static MLK_INLINE uint8_t mlk_value_barrier_u8(uint8_t b)
__contract__(ensures(return_value == b))
{
__asm__("" : "+r"(b));
return b;
}
#endif /* MLK_USE_ASM_VALUE_BARRIER */
/*
* The ct_cmask_nonzero_xxx functions below make deliberate use of unsigned
* overflow, which is fully defined behaviour in C. It is thus safe to disable
* this warning.
*/
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "unsigned-overflow"
#endif
/*************************************************
* Name: mlk_ct_cmask_nonzero_u16
*
* Description: Return 0 if input is zero, and -1 otherwise.
*
* Arguments: uint16_t x: Value to be converted into a mask
*
**************************************************/
/* Reference: Embedded in `cmov_int16()` in the reference implementation @[REF].
* - Use value barrier and shift instead of `b = -b` to
* convert condition into mask. */
static MLK_INLINE uint16_t mlk_ct_cmask_nonzero_u16(uint16_t x)
__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFFFF)))
{
/* Negate the zero-extended input modulo 2^32: the result is 0 for x == 0,
 * and at least 0xFFFF0001 otherwise, i.e. bits 16..31 are all set. The
 * value barrier hides this relationship from the compiler. */
uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
/* Keep only bits 16..31, which now form the 16-bit mask. */
tmp >>= 16;
return tmp;
}
/*************************************************
* Name: mlk_ct_cmask_nonzero_u8
*
* Description: Return 0 if input is zero, and -1 otherwise.
*
* Arguments: uint8_t x: Value to be converted into a mask
*
**************************************************/
/* Reference: Embedded in `verify()` and `cmov()` in the
* reference implementation @[REF].
* - We include a value barrier not present in the
* reference implementation, to prevent the compiler
* from realizing that this function returns a mask. */
static MLK_INLINE uint8_t mlk_ct_cmask_nonzero_u8(uint8_t x)
__contract__(ensures(return_value == ((x == 0) ? 0 : 0xFF)))
{
/* Negate the zero-extended input modulo 2^32: the result is 0 for x == 0,
 * and at least 0xFFFFFF01 otherwise, i.e. bits 8..31 are all set. The
 * value barrier hides this relationship from the compiler. */
uint32_t tmp = mlk_value_barrier_u32(-((uint32_t)x));
/* Keep only the top byte (bits 24..31), which now forms the 8-bit mask. */
tmp >>= 24;
return tmp;
}
/* Put unsigned overflow warnings in CBMC back into scope */
#ifdef CBMC
#pragma CPROVER check pop
#endif
/*
* The mlk_ct_cmask_neg_i16 function below makes deliberate use of
* signed to unsigned integer conversion, which is fully defined
* behaviour in C. It is thus safe to disable this warning.
*/
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "conversion"
#endif
/*************************************************
* Name: mlk_ct_cmask_neg_i16
*
* Description: Return 0 if input is non-negative, and -1 otherwise.
*
* Arguments: int16_t x: Value to be converted into a mask
*
**************************************************/
/* Reference: Embedded in polynomial compression function in the
* reference implementation @[REF].
* - Used as part of signed->unsigned conversion for modular
* representatives to detect whether the input is negative.
* This happens in `mlk_poly_reduce()` here, and as part of
* polynomial compression functions in the reference
* implementation. See `mlk_poly_reduce()`.
* - We use value barriers to reduce the risk of
* compiler-introduced branches. */
static MLK_INLINE uint16_t mlk_ct_cmask_neg_i16(int16_t x)
__contract__(ensures(return_value == ((x < 0) ? 0xFFFF : 0)))
{
/* Sign-extend to 32 bits and pass through a value barrier. */
int32_t tmp = mlk_value_barrier_i32((int32_t)x);
/* Shift out the 16 value bits so only copies of the sign remain: 0 for
 * non-negative x, -1 for negative x.
 * NOTE(review): right-shifting a negative signed value is
 * implementation-defined in C; this relies on an arithmetic shift. */
tmp >>= 16;
/* -1 becomes 0xFFFF when converted to the uint16_t return type. */
return (int16_t)tmp;
}
/* Put unsigned-to-signed warnings in CBMC back into scope */
#ifdef CBMC
#pragma CPROVER check pop
#endif
/*
* The mlk_ct_sel_int16 function below makes deliberate use of unsigned
* to signed integer conversion, which is implementation-defined
* behaviour. Here, we assume that uint16_t -> int16_t is inverse
* to int16_t -> uint16_t.
*/
#ifdef CBMC
#pragma CPROVER check push
#pragma CPROVER check disable "conversion"
#endif
/*************************************************
* Name: mlk_ct_sel_int16
*
* Description: Functionally equivalent to cond ? a : b,
* but implemented with guards against
* compiler-introduced branches.
*
* Arguments: int16_t a: First alternative
* int16_t b: Second alternative
* uint16_t cond: Condition variable.
*
* Specification:
* - With `a = MLKEM_Q_HALF` and `b=0`, this essentially
* implements `Decompress_1` @[FIPS203, Eq (4.8)] in `mlk_poly_frommsg()`.
* - With `a = x + MLKEM_Q`, `b = x`, and `cond` indicating whether `x`
* is negative, implements signed->unsigned conversion of modular
* representatives. Questions of representation are not considered
* in the specification @[FIPS203, Section 2.4.1, "The pseudocode is
* agnostic regarding how an integer modulo 𝑚 is represented in
* actual implementations"].
*
**************************************************/
/* Reference: Embedded in polynomial compression function in the
* reference implementation @[REF].
* - Used as part of signed->unsigned conversion for modular
* representatives. This happens in `mlk_poly_reduce()` here,
* and as part of polynomial compression functions in @[REF].
* See `mlk_poly_reduce()`.
* - Barrier to reduce the risk of compiler-introduced branches.
* For `a = MLKEM_Q_HALF` and `b=0`, also embedded in
* `poly_frommsg()` from the reference implementation, which uses
* `cmov_int16()` instead. */
static MLK_INLINE int16_t mlk_ct_sel_int16(int16_t a, int16_t b, uint16_t cond)
__contract__(ensures(return_value == (cond ? a : b)))
{
/* Work on the unsigned bit patterns of both alternatives. */
uint16_t au = a, bu = b;
/* The mask is 0xFFFF for cond != 0 and 0 otherwise, so the XOR either
 * turns bu into au (bu ^ au ^ bu) or leaves bu unchanged. */
uint16_t res = bu ^ (mlk_ct_cmask_nonzero_u16(cond) & (au ^ bu));
return (int16_t)res;
}
/* Put unsigned-to-signed warnings in CBMC back into scope */
#ifdef CBMC
#pragma CPROVER check pop
#endif
/*************************************************
* Name: mlk_ct_sel_uint8
*
* Description: Functionally equivalent to cond ? a : b,
* but implemented with guards against
* compiler-introduced branches.
*
* Arguments: uint8_t a: First alternative
* uint8_t b: Second alternative
* uint8_t cond: Condition variable.
*
**************************************************/
/* Reference: Embedded into `cmov()` in the reference implementation @[REF].
* - Use value barrier to get mask from condition value. */
static MLK_INLINE uint8_t mlk_ct_sel_uint8(uint8_t a, uint8_t b, uint8_t cond)
__contract__(ensures(return_value == (cond ? a : b)))
{
/* The mask is 0xFF for cond != 0 and 0 otherwise, so the XOR either turns
 * b into a (b ^ a ^ b) or leaves b unchanged. */
return b ^ (mlk_ct_cmask_nonzero_u8(cond) & (a ^ b));
}
/*************************************************
* Name: mlk_ct_memcmp
*
* Description: Compare two arrays for equality in constant time.
*
* Arguments: const uint8_t *a: pointer to first byte array
* const uint8_t *b: pointer to second byte array
* size_t len: length of the byte arrays
*
* Returns 0 if the byte arrays are equal, a non-zero value otherwise
*
* Specification:
* - Used to securely compute conditional move in
* @[FIPS203, Algorithm 18 (ML-KEM.Decaps_Internal, L9-11]
*
**************************************************/
/* Reference: `verify()` in the reference implementation @[REF]
* - We return `uint8_t`, not `int`.
* - We use an additional XOR-accumulator in the comparison loop
* which prevents early abort if the OR-accumulator is 0xFF.
* - We use a value barrier to convert the OR-accumulator into
* a mask. The reference implementation uses a shift which the
* compiler can argue to result in either 0 or 0xFF..FF. */
static MLK_INLINE uint8_t mlk_ct_memcmp(const uint8_t *a, const uint8_t *b,
const size_t len)
__contract__(
requires(memory_no_alias(a, len))
requires(memory_no_alias(b, len))
requires(len <= INT_MAX)
ensures((return_value == 0) == forall(i, 0, len, (a[i] == b[i]))))
{
/* r ORs together all byte differences (zero iff the arrays are equal);
 * s XORs them together and does not influence the result. */
uint8_t r = 0, s = 0;
/* The contract bounds len by INT_MAX, so an unsigned index suffices. */
unsigned i;
for (i = 0; i < len; i++)
__loop__(
invariant(i <= len)
invariant((r == 0) == (forall(k, 0, i, (a[k] == b[k])))))
{
r |= a[i] ^ b[i];
/* s is useless, but prevents the loop from being aborted once r=0xff. */
s ^= a[i] ^ b[i];
}
/*
* - Convert r into a mask; this may not be necessary, but is an additional
* safeguard against leaking information about a and b.
* - XOR twice with s, separated by a value barrier, to prevent the compiler
* from dropping the s computation in the loop.
*/
return (mlk_value_barrier_u8(mlk_ct_cmask_nonzero_u8(r) ^ s) ^ s);
}
/*************************************************
* Name: mlk_ct_cmov_zero
*
* Description: Copy len bytes from x to r if b is zero;
* don't modify r if b is non-zero.
* assumes two's complement representation of negative integers.
* Runs in constant time.
*
* Arguments: uint8_t *r: pointer to output byte array
* const uint8_t *x: pointer to input byte array
* size_t len: Amount of bytes to be copied
* uint8_t b: Condition value.
*
* Specification:
* - Used to securely compute conditional move in
* @[FIPS203, Algorithm 18 (ML-KEM.Decaps_Internal, L9-11]
*
**************************************************/
/* Reference: `cmov()` in the reference implementation @[REF].
* - We move if condition value is `0`, not `1`.
* - We use `mlk_ct_sel_uint8` for constant-time selection. */
static MLK_INLINE void mlk_ct_cmov_zero(uint8_t *r, const uint8_t *x,
size_t len, uint8_t b)
__contract__(
requires(memory_no_alias(r, len))
requires(memory_no_alias(x, len))
assigns(memory_slice(r, len)))
{
size_t i;
for (i = 0; i < len; i++)
__loop__(invariant(i <= len))
{
/* Every byte of r is rewritten regardless of b: it keeps its old value
 * when b is non-zero and takes x[i] when b is zero. */
r[i] = mlk_ct_sel_uint8(r[i], x[i], b);
}
}
/*************************************************
* Name: mlk_zeroize
*
* Description: Force-zeroize a buffer.
*
* Arguments: void *ptr: pointer to byte array to be zeroed
* size_t len: Amount of bytes to be zeroed
*
* Specification: Used to implement
* @[FIPS203, Section 3.3, Destruction of intermediate values]
*
**************************************************/
/* Reference: Not present in the reference implementation @[REF]. */
#if !defined(MLK_CONFIG_CUSTOM_ZEROIZE)
#if defined(MLK_SYS_WINDOWS)
#include <windows.h>
/* Windows variant: delegates to SecureZeroMemory() from <windows.h> to
 * overwrite the len bytes starting at ptr with zeros. */
static MLK_INLINE void mlk_zeroize(void *ptr, size_t len)
__contract__(
requires(memory_no_alias(ptr, len))
assigns(memory_slice(ptr, len))) { SecureZeroMemory(ptr, len); }
#elif defined(MLK_HAVE_INLINE_ASM)
#include <string.h>
/* Inline-asm variant: clears the len bytes starting at ptr with
 * mlk_memset() and follows it with a compiler barrier that takes ptr as
 * input and declares a memory clobber. */
static MLK_INLINE void mlk_zeroize(void *ptr, size_t len)
__contract__(
requires(memory_no_alias(ptr, len))
assigns(memory_slice(ptr, len)))
{
mlk_memset(ptr, 0, len);
/* This follows OpenSSL and seems sufficient to prevent the compiler
* from optimizing away the memset.
*
* If there was a reliable way to detect availability of memset_s(),
* that would be preferred. */
__asm__ __volatile__("" : : "r"(ptr) : "memory");
}
#else /* !MLK_SYS_WINDOWS && MLK_HAVE_INLINE_ASM */
#error No plausibly-secure implementation of mlk_zeroize available. Please provide your own using MLK_CONFIG_CUSTOM_ZEROIZE.
#endif /* !MLK_SYS_WINDOWS && !MLK_HAVE_INLINE_ASM */
#endif /* !MLK_CONFIG_CUSTOM_ZEROIZE */
#endif /* !MLK_VERIFY_H */

View File

@@ -0,0 +1,31 @@
/*
* Copyright (c) The mlkem-native project authors
* SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT
*/
/*
* WARNING: This file is auto-generated from scripts/autogen
* in the mlkem-native repository.
* Do not modify it directly.
*/
#include <stdint.h>
/*
* Table of zeta values used in the reference NTT and inverse NTT.
* See autogen for details.
*/
static MLK_ALIGN const int16_t mlk_zetas[128] = {
-1044, -758, -359, -1517, 1493, 1422, 287, 202, -171, 622, 1577,
182, 962, -1202, -1474, 1468, 573, -1325, 264, 383, -829, 1458,
-1602, -130, -681, 1017, 732, 608, -1542, 411, -205, -1571, 1223,
652, -552, 1015, -1293, 1491, -282, -1544, 516, -8, -320, -666,
-1618, -1162, 126, 1469, -853, -90, -271, 830, 107, -1421, -247,
-951, -398, 961, -1508, -725, 448, -1065, 677, -1275, -1103, 430,
555, 843, -1251, 871, 1550, 105, 422, 587, 177, -235, -291,
-460, 1574, 1653, -246, 778, 1159, -147, -777, 1483, -602, 1119,
-1590, 644, -872, 349, 418, 329, -156, -75, 817, 1097, 603,
610, 1322, -1285, -1465, 384, -1215, -136, 1218, -1335, -874, 220,
-1187, -1659, -1185, -1530, -1278, 794, -1510, -854, -870, 478, -108,
-308, 996, 991, 958, -1460, 1522, 1628,
};

View File

@@ -0,0 +1,21 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
#ifndef MLKEM_NATIVE_BACKEND_H
#define MLKEM_NATIVE_BACKEND_H
#include <openssl/target.h>
// Select the native arithmetic backend for mlkem-native.
//
// A backend header is only included when assembly is enabled
// (OPENSSL_NO_ASM is not defined) and the target is Linux or Apple
// (OPENSSL_LINUX / OPENSSL_APPLE):
//   - AArch64: mlkem/native/aarch64/meta.h
//   - x86_64:  mlkem/native/x86_64/meta.h, unless
//              MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX is defined
// In every other configuration this header includes no backend.
#if !defined(OPENSSL_NO_ASM) && \
(defined(OPENSSL_LINUX) || defined(OPENSSL_APPLE))
#if defined(OPENSSL_AARCH64)
#include "mlkem/native/aarch64/meta.h"
#elif defined(OPENSSL_X86_64) && !defined(MY_ASSEMBLER_IS_TOO_OLD_FOR_AVX)
#include "mlkem/native/x86_64/meta.h"
#endif // OPENSSL_AARCH64 / OPENSSL_X86_64
#endif // !OPENSSL_NO_ASM && (OPENSSL_LINUX || OPENSSL_APPLE)
#endif /* MLKEM_NATIVE_BACKEND_H */

View File

@@ -0,0 +1,114 @@
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0 OR ISC
#ifndef MLK_CONFIG_H
#define MLK_CONFIG_H
#if !defined(__ASSEMBLER__)
#include "../../internal.h"
#endif
// Namespacing: All symbols are of the form mlkem*. Level-specific
// symbols are further prefixed with their security level, e.g.
// mlkem512*, mlkem768*, mlkem1024*.
#define MLK_CONFIG_NAMESPACE_PREFIX mlkem
// Replace mlkem-native's FIPS 202 headers with glue code to
// AWS-LC's own FIPS 202 implementation. Both paths are given as
// string literals for use in #include directives.
#define MLK_CONFIG_FIPS202_CUSTOM_HEADER "../fips202_glue.h"
#define MLK_CONFIG_FIPS202X4_CUSTOM_HEADER "../fips202x4_glue.h"
// Everything is built in a single compilation unit (CU), so both the
// internal and the external mlkem-native API can have internal linkage
// (i.e. be declared static).
#define MLK_CONFIG_INTERNAL_API_QUALIFIER static
#define MLK_CONFIG_EXTERNAL_API_QUALIFIER static
// Enable the key-generation pairwise consistency test (PCT) if and only
// if AWS-LC is built in FIPS mode.
#if defined(AWSLC_FIPS)
#define MLK_CONFIG_KEYGEN_PCT
#endif // AWSLC_FIPS
// Map the CPU capability function to the ones used by AWS-LC
#define MLK_CONFIG_CUSTOM_CAPABILITY_FUNC
#if !defined(__ASSEMBLER__)
#include <stdint.h>
#include "mlkem/sys.h"
// mlk_sys_check_capability: CPU capability query used by mlkem-native,
// answered with AWS-LC's own CPU feature detection.
//
// cap: the capability being queried.
//
// Returns the value of CRYPTO_is_AVX2_capable() when built for x86_64
// (MLK_SYS_X86_64) and cap is MLK_SYS_CAP_AVX2. Returns 0 for every other
// capability and on every other target.
static MLK_INLINE int mlk_sys_check_capability(mlk_sys_cap cap)
{
#if defined(MLK_SYS_X86_64)
  if (cap == MLK_SYS_CAP_AVX2)
  {
    return CRYPTO_is_AVX2_capable();
  }
#else
  // cap is not inspected on non-x86_64 targets; mark it as used so that
  // builds with -Wunused-parameter stay warning-free.
  (void)cap;
#endif
  return 0;
}
#endif
#if defined(BORINGSSL_FIPS_BREAK_TESTS)
#define MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST
#if !defined(__ASSEMBLER__) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include "mlkem/sys.h"
// mlk_break_pct: breakage hook for the key-generation PCT, available when
// MLK_CONFIG_KEYGEN_PCT_BREAKAGE_TEST is set.
//
// Asks boringssl_fips_break_test() about the label "MLKEM_PWCT" and
// returns that answer unchanged.
static MLK_INLINE int mlk_break_pct(void)
{
  const int break_requested = boringssl_fips_break_test("MLKEM_PWCT");
  return break_requested;
}
#endif // !__ASSEMBLER__ && !MLK_CONFIG_MULTILEVEL_NO_SHARED
#endif // BORINGSSL_FIPS_BREAK_TESTS
// Enable valgrind-based assertions in mlkem-native through macro
// from AWS-LC/BoringSSL: MLK_CONFIG_CT_TESTING_ENABLED is defined exactly
// when BORINGSSL_CONSTANT_TIME_VALIDATION is.
#if defined(BORINGSSL_CONSTANT_TIME_VALIDATION)
#define MLK_CONFIG_CT_TESTING_ENABLED
#endif // BORINGSSL_CONSTANT_TIME_VALIDATION
// Map zeroization function to the one used by AWS-LC.
// The definition below is omitted when this header is read by the
// assembler or when MLK_CONFIG_MULTILEVEL_NO_SHARED is defined.
#define MLK_CONFIG_CUSTOM_ZEROIZE
#if !defined(__ASSEMBLER__) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <stdint.h>
#include "mlkem/sys.h"
#include <openssl/base.h>
// mlk_zeroize: clear len bytes starting at ptr by delegating to
// OPENSSL_cleanse().
static MLK_INLINE void mlk_zeroize(void *ptr, size_t len) {
OPENSSL_cleanse(ptr, len);
}
#endif // !__ASSEMBLER__ && !MLK_CONFIG_MULTILEVEL_NO_SHARED
// Map randombytes function to the one used by AWS-LC.
// The definition below is omitted when this header is read by the
// assembler or when MLK_CONFIG_MULTILEVEL_NO_SHARED is defined.
#define MLK_CONFIG_CUSTOM_RANDOMBYTES
#if !defined(__ASSEMBLER__) && !defined(MLK_CONFIG_MULTILEVEL_NO_SHARED)
#include <stdint.h>
#include "mlkem/sys.h"
#include <openssl/rand.h>
// mlk_randombytes: request len random bytes into ptr via RAND_bytes().
//
// The function returns void, so a failure cannot be reported to the caller;
// the RAND_bytes() result is passed to AWSLC_ABORT_IF_NOT_ONE instead.
// NOTE(review): going by its name, that macro aborts unless the result
// is 1 -- confirm against its definition.
static MLK_INLINE void mlk_randombytes(void *ptr, size_t len) {
AWSLC_ABORT_IF_NOT_ONE(RAND_bytes(ptr, len));
}
#endif // !__ASSEMBLER__ && !MLK_CONFIG_MULTILEVEL_NO_SHARED
// Map memcpy function to the one used by AWS-LC
#define MLK_CONFIG_CUSTOM_MEMCPY
#if !defined(__ASSEMBLER__)
#include <stdint.h>
#include "mlkem/sys.h"
// mlk_memcpy: mlkem-native's memcpy hook, forwarded to AWS-LC.
//
// Passes dest, src and n straight to OPENSSL_memcpy() and returns that
// call's result unchanged.
static MLK_INLINE void *mlk_memcpy(void *dest, const void *src, size_t n)
{
  void *const copied = OPENSSL_memcpy(dest, src, n);
  return copied;
}
#endif // !__ASSEMBLER__
// Map memset function to the one used by AWS-LC
#define MLK_CONFIG_CUSTOM_MEMSET
#if !defined(__ASSEMBLER__)
#include <stdint.h>
#include "mlkem/sys.h"
// mlk_memset: mlkem-native's memset hook, forwarded to AWS-LC.
//
// Passes s, c and n straight to OPENSSL_memset() and returns that call's
// result unchanged.
static MLK_INLINE void *mlk_memset(void *s, int c, size_t n)
{
  void *const filled = OPENSSL_memset(s, c, n);
  return filled;
}
#endif // !__ASSEMBLER__
#if defined(OPENSSL_NO_ASM)
#define MLK_CONFIG_NO_ASM
#endif
// Enable the native arithmetic backend and set the path of the header that
// selects it (AArch64 or x86_64, depending on the target).
#define MLK_CONFIG_USE_NATIVE_BACKEND_ARITH
#define MLK_CONFIG_ARITH_BACKEND_FILE "../mlkem_native_backend.h"
#endif // MLK_CONFIG_H